UVa 1597 Searching the Web
Problem Analysis
Problem Description
This problem asks for a small search engine that answers Boolean queries using an inverted index. The system must build the index from a given collection of documents and support four query types:
- Single-term query
- AND query (the intersection of two terms)
- OR query (the union of two terms)
- NOT query (documents that do not contain a given term)
Input Format
- The number of documents $N$ ($0 < N < 100$)
- $N$ documents, each terminated by a line of ten asterisks (`**********`)
- The number of queries $M$ ($0 < M \leq 50000$)
- $M$ queries, one per line
Output Requirements
- For an ordinary query, output the lines that contain the query term(s)
- For a NOT query, output the matching documents in full
- Separate documents with a line of `----------`
- End each query's result with a line of `==========`
- When nothing matches, output `Sorry, I found nothing.`
Key Constraints
- Tokenization: words are delimited by non-alphabetic characters, and case is ignored
- Stop words: "the", "a", "to", "and", "or", "not" take part in neither indexing nor querying
- Word-form variations and hyphenated compounds need not be considered
Solution Approach
1. Data Structure Design
The solution relies on the following core data structures:
- `vector<vector<string>> documents`: stores the original document contents
- `unordered_map<string, vector<pair<int, int>>> invertedIndex`: the inverted index, mapping each word to a list of (document ID, line number) pairs
- `unordered_set<string> stopWords`: the stop-word set
2. Index Construction
- Document reading: read each document line by line until a line of `**********` ends it
- Tokenization:
  - convert the text to lowercase
  - split on non-alphabetic characters
  - filter out stop words
- Index update: for each valid word, record the (document ID, line number) pairs where it occurs (a sketch of this pipeline follows the list)
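A minimal sketch of this pipeline, assuming the documents have already been read into memory. The names `extractTerms` and `buildIndex` are illustrative, not taken from the reference code, which interleaves the same work with input reading:

```cpp
#include <cctype>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

// Lowercase the line, split on non-letters, drop stop words.
std::vector<std::string> extractTerms(const std::string& line) {
    static const std::unordered_set<std::string> kStopWords =
        {"the", "a", "to", "and", "or", "not"};
    std::vector<std::string> terms;
    std::string current;
    for (unsigned char ch : line) {
        if (std::isalpha(ch)) {
            current += static_cast<char>(std::tolower(ch)); // case-insensitive
        } else if (!current.empty()) {                      // a non-letter ends the word
            if (!kStopWords.count(current)) terms.push_back(current);
            current.clear();
        }
    }
    if (!current.empty() && !kStopWords.count(current)) terms.push_back(current);
    return terms;
}

using InvertedIndex =
    std::unordered_map<std::string, std::vector<std::pair<int, int>>>;

// Walk every line of every document and record where each valid word occurs.
InvertedIndex buildIndex(const std::vector<std::vector<std::string>>& docs) {
    InvertedIndex index;
    for (int docId = 0; docId < static_cast<int>(docs.size()); ++docId) {
        for (int lineNo = 0; lineNo < static_cast<int>(docs[docId].size()); ++lineNo) {
            for (const std::string& term : extractTerms(docs[docId][lineNo])) {
                index[term].emplace_back(docId, lineNo); // (document ID, line number)
            }
        }
    }
    return index;
}
```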
3. Query Processing
Single-term query
Fetch the term's posting list from the inverted index, then print the matching lines ordered by document and by line number within each document (a sketch follows below).
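A minimal sketch of that sort-and-deduplicate step, using an illustrative `singleTermMatches` helper (the reference code gets the same effect with a `set<pair<int, int>>`):

```cpp
#include <algorithm>
#include <utility>
#include <vector>

// Sort and deduplicate a posting list so printing follows
// document order first, then line order within a document.
std::vector<std::pair<int, int>> singleTermMatches(
        std::vector<std::pair<int, int>> postings) { // taken by value, sorted in place
    std::sort(postings.begin(), postings.end());
    postings.erase(std::unique(postings.begin(), postings.end()), postings.end());
    return postings;
}
```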
AND query
- Fetch the document sets of both terms
- Take their intersection (the documents in which both terms appear)
- Merge the matching lines from both terms, deduplicate, and print them (see the sketch after this list)
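A minimal sketch of the intersection step. `andQuery` and its return shape are illustrative; the reference code below does the same job with boolean marker arrays indexed by document ID:

```cpp
#include <algorithm>
#include <set>
#include <unordered_map>
#include <utility>
#include <vector>

// For each document containing both terms, collect the union of matching lines.
std::vector<std::pair<int, std::set<int>>> andQuery(
        const std::vector<std::pair<int, int>>& postings1,
        const std::vector<std::pair<int, int>>& postings2) {
    std::unordered_map<int, std::set<int>> byDoc1, byDoc2;
    for (const auto& [doc, line] : postings1) byDoc1[doc].insert(line);
    for (const auto& [doc, line] : postings2) byDoc2[doc].insert(line);

    std::vector<std::pair<int, std::set<int>>> result;
    for (const auto& [doc, lines] : byDoc1) {
        auto it = byDoc2.find(doc);
        if (it == byDoc2.end()) continue; // the document must contain both terms
        std::set<int> merged = lines;
        merged.insert(it->second.begin(), it->second.end());
        result.emplace_back(doc, std::move(merged));
    }
    std::sort(result.begin(), result.end(),
              [](const auto& a, const auto& b) { return a.first < b.first; });
    return result;
}
```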
OR query
- Fetch the document sets of both terms
- Take their union (the documents in which either term appears)
- Merge the matching lines from both terms, deduplicate, and print them (see the sketch after this list)
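The OR case is simpler. A minimal sketch: merge both posting lists into a single ordered map keyed by document ID, which yields the union, the document order, and line-level deduplication in one go (`orQuery` is an illustrative name):

```cpp
#include <map>
#include <set>
#include <utility>
#include <vector>

// document ID -> matching line numbers, for documents containing either term.
std::map<int, std::set<int>> orQuery(
        const std::vector<std::pair<int, int>>& postings1,
        const std::vector<std::pair<int, int>>& postings2) {
    std::map<int, std::set<int>> byDoc;
    for (const auto& [doc, line] : postings1) byDoc[doc].insert(line);
    for (const auto& [doc, line] : postings2) byDoc[doc].insert(line);
    return byDoc; // already in document order, lines deduplicated and sorted
}
```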
NOT query
- Fetch the document set of the query term
- Print, in full, every document that does not contain the term (see the sketch after this list)
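A minimal sketch of the complement step; `notQuery` is an illustrative name, and `docCount` is assumed to be the total number of documents:

```cpp
#include <utility>
#include <vector>

// IDs of the documents that do not contain the term; each is later printed in full.
std::vector<int> notQuery(int docCount,
                          const std::vector<std::pair<int, int>>& postings) {
    std::vector<bool> containsTerm(docCount, false);
    for (const auto& entry : postings) containsTerm[entry.first] = true;

    std::vector<int> result;
    for (int doc = 0; doc < docCount; ++doc)
        if (!containsTerm[doc]) result.push_back(doc);
    return result;
}
```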
4. Output Handling
- Use a `set` to deduplicate and order matches automatically
- Preserve document order and line-number order
- Insert the separators at the right moments (see the sketch after this list)
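A minimal sketch of the output framing, assuming a caller-supplied `printDoc` callback that prints whatever a query selected for one document (matched lines, or the whole document for NOT):

```cpp
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

// Print one query's result with the required separators and terminator.
void printResult(const std::vector<int>& matchedDocIds,
                 const std::function<void(int)>& printDoc) {
    if (matchedDocIds.empty()) {
        std::cout << "Sorry, I found nothing.\n";
    } else {
        for (std::size_t i = 0; i < matchedDocIds.size(); ++i) {
            if (i > 0) std::cout << "----------\n"; // between documents only
            printDoc(matchedDocIds[i]);
        }
    }
    std::cout << "==========\n";                    // after every query, match or not
}
```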
Reference Code
```cpp
// Searching the Web
// UVa ID: 1597
// Verdict: Accepted
// Submission Date: 2025-11-01
// UVa Run Time: 0.030s
//
// Copyright (C) 2025, 邱秋. metaphysis # yeah dot net

#include <bits/stdc++.h>

using namespace std;

// Stop-word set
const unordered_set<string> STOP_WORDS = {"the", "a", "to", "and", "or", "not"};

// Convert a string to lowercase
string toLowerCase(const string& inputStr) {
    string result = inputStr;
    for (char& ch : result) {
        ch = tolower(ch);
    }
    return result;
}

// Tokenizer: split on non-alphabetic characters
vector<string> tokenizeLine(const string& lineText) {
    vector<string> tokens;
    string currentToken;
    for (char ch : lineText) {
        if (isalpha(ch)) {
            currentToken += tolower(ch);
        } else {
            if (!currentToken.empty()) {
                tokens.push_back(currentToken);
                currentToken.clear();
            }
        }
    }
    if (!currentToken.empty()) {
        tokens.push_back(currentToken);
    }
    return tokens;
}

int main() {
    ios::sync_with_stdio(false);
    cin.tie(nullptr);

    int docCount;
    cin >> docCount;
    cin.ignore(); // skip the trailing newline

    vector<vector<string>> documents(docCount); // contents of all documents
    unordered_map<string, vector<pair<int, int>>> invertedIndex; // inverted index: word -> [(document ID, line number)]

    // Read and process all documents
    for (int docId = 0; docId < docCount; ++docId) {
        string line;
        while (getline(cin, line)) {
            if (line == "**********") break; // end-of-document marker
            documents[docId].push_back(line);
            // Tokenize the line and update the index
            vector<string> words = tokenizeLine(line);
            for (const string& word : words) {
                // Skip stop words
                if (STOP_WORDS.find(word) == STOP_WORDS.end()) {
                    invertedIndex[word].emplace_back(docId, documents[docId].size() - 1);
                }
            }
        }
    }

    int queryCount;
    cin >> queryCount;
    cin.ignore();

    // Process each query
    for (int queryIdx = 0; queryIdx < queryCount; ++queryIdx) {
        string queryStr;
        getline(cin, queryStr);

        // Dispatch on query type
        if (queryStr.find("AND") != string::npos) {
            // AND query: term1 AND term2
            size_t andPos = queryStr.find(" AND ");
            string term1 = queryStr.substr(0, andPos);
            string term2 = queryStr.substr(andPos + 5);
            auto& term1List = invertedIndex[term1];
            auto& term2List = invertedIndex[term2];
            // Mark the documents containing each term
            vector<bool> hasTerm1(docCount, false), hasTerm2(docCount, false);
            for (const auto& entry : term1List) hasTerm1[entry.first] = true;
            for (const auto& entry : term2List) hasTerm2[entry.first] = true;
            // Find the documents containing both terms
            vector<int> matchingDocs;
            for (int docId = 0; docId < docCount; ++docId) {
                if (hasTerm1[docId] && hasTerm2[docId]) {
                    matchingDocs.push_back(docId);
                }
            }
            // Output the matching lines
            bool foundMatch = false;
            for (size_t i = 0; i < matchingDocs.size(); ++i) {
                int docId = matchingDocs[i];
                if (i > 0) cout << "----------\n";
                // Collect the matching lines, removing duplicates
                set<int> matchedLines;
                for (const auto& entry : term1List) {
                    if (entry.first == docId) {
                        matchedLines.insert(entry.second);
                    }
                }
                for (const auto& entry : term2List) {
                    if (entry.first == docId) {
                        matchedLines.insert(entry.second);
                    }
                }
                // Print the matching lines
                for (int lineNum : matchedLines) {
                    cout << documents[docId][lineNum] << '\n';
                    foundMatch = true;
                }
            }
            if (!foundMatch) {
                cout << "Sorry, I found nothing.\n";
            }
        } else if (queryStr.find("OR") != string::npos) {
            // OR query: term1 OR term2
            size_t orPos = queryStr.find(" OR ");
            string term1 = queryStr.substr(0, orPos);
            string term2 = queryStr.substr(orPos + 4);
            auto& term1List = invertedIndex[term1];
            auto& term2List = invertedIndex[term2];
            // Mark the documents containing either term
            vector<bool> matchedDocs(docCount, false);
            for (const auto& entry : term1List) matchedDocs[entry.first] = true;
            for (const auto& entry : term2List) matchedDocs[entry.first] = true;
            vector<int> docsInOrder;
            for (int docId = 0; docId < docCount; ++docId) {
                if (matchedDocs[docId]) docsInOrder.push_back(docId);
            }
            // Output the matching lines
            bool foundMatch = false;
            for (size_t i = 0; i < docsInOrder.size(); ++i) {
                int docId = docsInOrder[i];
                if (i > 0) cout << "----------\n";
                // Collect the matching lines, removing duplicates
                set<int> matchedLines;
                for (const auto& entry : term1List) {
                    if (entry.first == docId) {
                        matchedLines.insert(entry.second);
                    }
                }
                for (const auto& entry : term2List) {
                    if (entry.first == docId) {
                        matchedLines.insert(entry.second);
                    }
                }
                // Print the matching lines
                for (int lineNum : matchedLines) {
                    cout << documents[docId][lineNum] << '\n';
                    foundMatch = true;
                }
            }
            if (!foundMatch) {
                cout << "Sorry, I found nothing.\n";
            }
        } else if (queryStr.find("NOT") != string::npos) {
            // NOT query: NOT term
            string term = queryStr.substr(4);
            // Mark the documents containing the term
            vector<bool> excludeDocs(docCount, false);
            if (invertedIndex.find(term) != invertedIndex.end()) {
                for (const auto& entry : invertedIndex[term]) {
                    excludeDocs[entry.first] = true;
                }
            }
            // Output every document that does not contain the term, in full
            bool foundMatch = false;
            for (int docId = 0; docId < docCount; ++docId) {
                if (!excludeDocs[docId]) {
                    if (foundMatch) cout << "----------\n";
                    for (const string& line : documents[docId]) {
                        cout << line << '\n';
                    }
                    foundMatch = true;
                }
            }
            if (!foundMatch) {
                cout << "Sorry, I found nothing.\n";
            }
        } else {
            // Single-term query
            string term = queryStr;
            if (invertedIndex.find(term) == invertedIndex.end() || invertedIndex[term].empty()) {
                cout << "Sorry, I found nothing.\n";
            } else {
                auto& termList = invertedIndex[term];
                // Deduplicate and sort
                set<pair<int, int>> uniqueEntries;
                for (const auto& entry : termList) {
                    uniqueEntries.insert(entry);
                }
                // Print in document order, then line order
                int prevDocId = -1;
                for (const auto& entry : uniqueEntries) {
                    int docId = entry.first;
                    int lineNum = entry.second;
                    if (docId != prevDocId) {
                        if (prevDocId != -1) cout << "----------\n";
                        prevDocId = docId;
                    }
                    cout << documents[docId][lineNum] << '\n';
                }
            }
        }

        // End-of-result marker for each query
        cout << "==========\n";
    }

    return 0;
}
```
