UVa 1597 Searching the Web
Problem Analysis
Problem Description
This problem asks for a small search engine that answers Boolean queries using an inverted index. The system must build the index from a given collection of documents and support four query types:
- Single-term query
- AND query (the intersection of two terms)
- OR query (the union of two terms)
- NOT query (documents that do not contain a given term)
Input Format
- The number of documents $N$ ($0 < N < 100$)
- $N$ documents, each terminated by a line of ten asterisks (`**********`)
- The number of queries $M$ ($0 < M \leq 50000$)
- $M$ queries, one per line
Output Requirements
- For an ordinary query, output the lines that contain the query term(s)
- For a NOT query, output the matching documents in full
- Separate documents with a line of `----------`
- End each query's result with a line of `==========`
- When nothing matches, output `Sorry, I found nothing.`
Key Constraints
- Tokenization: words are delimited by non-alphabetic characters, and case is ignored
- Stop words: "the", "a", "to", "and", "or", "not" take part in neither indexing nor querying
- Word-form variations and hyphenated compounds need not be considered
Solution Approach
1. Data Structure Design
The solution relies on the following core data structures:
- `vector<vector<string>> documents`: stores the original document contents
- `unordered_map<string, vector<pair<int, int>>> invertedIndex`: the inverted index, mapping each word to a list of (document ID, line number) pairs
- `unordered_set<string> stopWords`: the stop-word set
2. Index Construction
- Document reading: read each document line by line until a line of `**********` ends it
- Tokenization:
  - convert the text to lowercase
  - split on non-alphabetic characters
  - filter out stop words
- Index update: for each valid word, record the (document ID, line number) pairs where it occurs (a sketch of this pipeline follows the list)
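A minimal sketch of this pipeline, assuming the documents have already been read into memory. The names `extractTerms` and `buildIndex` are illustrative, not taken from the reference code, which interleaves the same work with input reading:

```cpp
#include <cctype>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

// Lowercase the line, split on non-letters, drop stop words.
std::vector<std::string> extractTerms(const std::string& line) {
    static const std::unordered_set<std::string> kStopWords =
        {"the", "a", "to", "and", "or", "not"};
    std::vector<std::string> terms;
    std::string current;
    for (unsigned char ch : line) {
        if (std::isalpha(ch)) {
            current += static_cast<char>(std::tolower(ch)); // case-insensitive
        } else if (!current.empty()) {                      // a non-letter ends the word
            if (!kStopWords.count(current)) terms.push_back(current);
            current.clear();
        }
    }
    if (!current.empty() && !kStopWords.count(current)) terms.push_back(current);
    return terms;
}

using InvertedIndex =
    std::unordered_map<std::string, std::vector<std::pair<int, int>>>;

// Walk every line of every document and record where each valid word occurs.
InvertedIndex buildIndex(const std::vector<std::vector<std::string>>& docs) {
    InvertedIndex index;
    for (int docId = 0; docId < static_cast<int>(docs.size()); ++docId) {
        for (int lineNo = 0; lineNo < static_cast<int>(docs[docId].size()); ++lineNo) {
            for (const std::string& term : extractTerms(docs[docId][lineNo])) {
                index[term].emplace_back(docId, lineNo); // (document ID, line number)
            }
        }
    }
    return index;
}
```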
3. Query Processing
Single-term query
Fetch the term's posting list from the inverted index, then print the matching lines ordered by document and by line number within each document (a sketch follows below).
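A minimal sketch of that sort-and-deduplicate step, using an illustrative `singleTermMatches` helper (the reference code gets the same effect with a `set<pair<int, int>>`):

```cpp
#include <algorithm>
#include <utility>
#include <vector>

// Sort and deduplicate a posting list so printing follows
// document order first, then line order within a document.
std::vector<std::pair<int, int>> singleTermMatches(
        std::vector<std::pair<int, int>> postings) { // taken by value, sorted in place
    std::sort(postings.begin(), postings.end());
    postings.erase(std::unique(postings.begin(), postings.end()), postings.end());
    return postings;
}
```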
AND query
- Fetch the document sets of both terms
- Take their intersection (the documents in which both terms appear)
- Merge the matching lines from both terms, deduplicate, and print them (see the sketch after this list)
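A minimal sketch of the intersection step. `andQuery` and its return shape are illustrative; the reference code below does the same job with boolean marker arrays indexed by document ID:

```cpp
#include <algorithm>
#include <set>
#include <unordered_map>
#include <utility>
#include <vector>

// For each document containing both terms, collect the union of matching lines.
std::vector<std::pair<int, std::set<int>>> andQuery(
        const std::vector<std::pair<int, int>>& postings1,
        const std::vector<std::pair<int, int>>& postings2) {
    std::unordered_map<int, std::set<int>> byDoc1, byDoc2;
    for (const auto& [doc, line] : postings1) byDoc1[doc].insert(line);
    for (const auto& [doc, line] : postings2) byDoc2[doc].insert(line);

    std::vector<std::pair<int, std::set<int>>> result;
    for (const auto& [doc, lines] : byDoc1) {
        auto it = byDoc2.find(doc);
        if (it == byDoc2.end()) continue; // the document must contain both terms
        std::set<int> merged = lines;
        merged.insert(it->second.begin(), it->second.end());
        result.emplace_back(doc, std::move(merged));
    }
    std::sort(result.begin(), result.end(),
              [](const auto& a, const auto& b) { return a.first < b.first; });
    return result;
}
```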
OR query
- Fetch the document sets of both terms
- Take their union (the documents in which either term appears)
- Merge the matching lines from both terms, deduplicate, and print them (see the sketch after this list)
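The OR case is simpler. A minimal sketch: merge both posting lists into a single ordered map keyed by document ID, which yields the union, the document order, and line-level deduplication in one go (`orQuery` is an illustrative name):

```cpp
#include <map>
#include <set>
#include <utility>
#include <vector>

// document ID -> matching line numbers, for documents containing either term.
std::map<int, std::set<int>> orQuery(
        const std::vector<std::pair<int, int>>& postings1,
        const std::vector<std::pair<int, int>>& postings2) {
    std::map<int, std::set<int>> byDoc;
    for (const auto& [doc, line] : postings1) byDoc[doc].insert(line);
    for (const auto& [doc, line] : postings2) byDoc[doc].insert(line);
    return byDoc; // already in document order, lines deduplicated and sorted
}
```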
NOT query
- Fetch the document set of the query term
- Print, in full, every document that does not contain the term (see the sketch after this list)
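A minimal sketch of the complement step; `notQuery` is an illustrative name, and `docCount` is assumed to be the total number of documents:

```cpp
#include <utility>
#include <vector>

// IDs of the documents that do not contain the term; each is later printed in full.
std::vector<int> notQuery(int docCount,
                          const std::vector<std::pair<int, int>>& postings) {
    std::vector<bool> containsTerm(docCount, false);
    for (const auto& entry : postings) containsTerm[entry.first] = true;

    std::vector<int> result;
    for (int doc = 0; doc < docCount; ++doc)
        if (!containsTerm[doc]) result.push_back(doc);
    return result;
}
```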
4. Output Handling
- Use a `set` to deduplicate and order matches automatically
- Preserve document order and line-number order
- Insert the separators at the right moments (see the sketch after this list)
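A minimal sketch of the output framing, assuming a caller-supplied `printDoc` callback that prints whatever a query selected for one document (matched lines, or the whole document for NOT):

```cpp
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

// Print one query's result with the required separators and terminator.
void printResult(const std::vector<int>& matchedDocIds,
                 const std::function<void(int)>& printDoc) {
    if (matchedDocIds.empty()) {
        std::cout << "Sorry, I found nothing.\n";
    } else {
        for (std::size_t i = 0; i < matchedDocIds.size(); ++i) {
            if (i > 0) std::cout << "----------\n"; // between documents only
            printDoc(matchedDocIds[i]);
        }
    }
    std::cout << "==========\n";                    // after every query, match or not
}
```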
Reference Code
```cpp
// Searching the Web
// UVa ID: 1597
// Verdict: Accepted
// Submission Date: 2025-11-01
// UVa Run Time: 0.030s
//
// Copyright (C) 2025, 邱秋. metaphysis # yeah dot net

#include <bits/stdc++.h>

using namespace std;

// Stop-word set
const unordered_set<string> STOP_WORDS = {"the", "a", "to", "and", "or", "not"};

// Convert a string to lowercase
string toLowerCase(const string& inputStr) {
    string result = inputStr;
    for (char& ch : result) {
        ch = tolower(ch);
    }
    return result;
}

// Tokenizer: split on non-alphabetic characters
vector<string> tokenizeLine(const string& lineText) {
    vector<string> tokens;
    string currentToken;
    for (char ch : lineText) {
        if (isalpha(ch)) {
            currentToken += tolower(ch);
        } else {
            if (!currentToken.empty()) {
                tokens.push_back(currentToken);
                currentToken.clear();
            }
        }
    }
    if (!currentToken.empty()) {
        tokens.push_back(currentToken);
    }
    return tokens;
}

int main() {
    ios::sync_with_stdio(false);
    cin.tie(nullptr);

    int docCount;
    cin >> docCount;
    cin.ignore(); // skip the trailing newline

    vector<vector<string>> documents(docCount); // contents of all documents
    unordered_map<string, vector<pair<int, int>>> invertedIndex; // inverted index: word -> [(document ID, line number)]

    // Read and process all documents
    for (int docId = 0; docId < docCount; ++docId) {
        string line;
        while (getline(cin, line)) {
            if (line == "**********") break; // end-of-document marker
            documents[docId].push_back(line);
            // Tokenize the line and update the index
            vector<string> words = tokenizeLine(line);
            for (const string& word : words) {
                // Skip stop words
                if (STOP_WORDS.find(word) == STOP_WORDS.end()) {
                    invertedIndex[word].emplace_back(docId, documents[docId].size() - 1);
                }
            }
        }
    }

    int queryCount;
    cin >> queryCount;
    cin.ignore();

    // Process each query
    for (int queryIdx = 0; queryIdx < queryCount; ++queryIdx) {
        string queryStr;
        getline(cin, queryStr);

        // Dispatch on query type
        if (queryStr.find("AND") != string::npos) {
            // AND query: term1 AND term2
            size_t andPos = queryStr.find(" AND ");
            string term1 = queryStr.substr(0, andPos);
            string term2 = queryStr.substr(andPos + 5);
            auto& term1List = invertedIndex[term1];
            auto& term2List = invertedIndex[term2];
            // Mark the documents containing each term
            vector<bool> hasTerm1(docCount, false), hasTerm2(docCount, false);
            for (const auto& entry : term1List) hasTerm1[entry.first] = true;
            for (const auto& entry : term2List) hasTerm2[entry.first] = true;
            // Find the documents containing both terms
            vector<int> matchingDocs;
            for (int docId = 0; docId < docCount; ++docId) {
                if (hasTerm1[docId] && hasTerm2[docId]) {
                    matchingDocs.push_back(docId);
                }
            }
            // Output the matching lines
            bool foundMatch = false;
            for (size_t i = 0; i < matchingDocs.size(); ++i) {
                int docId = matchingDocs[i];
                if (i > 0) cout << "----------\n";
                // Collect the matching lines, removing duplicates
                set<int> matchedLines;
                for (const auto& entry : term1List) {
                    if (entry.first == docId) {
                        matchedLines.insert(entry.second);
                    }
                }
                for (const auto& entry : term2List) {
                    if (entry.first == docId) {
                        matchedLines.insert(entry.second);
                    }
                }
                // Print the matching lines
                for (int lineNum : matchedLines) {
                    cout << documents[docId][lineNum] << '\n';
                    foundMatch = true;
                }
            }
            if (!foundMatch) {
                cout << "Sorry, I found nothing.\n";
            }
        } else if (queryStr.find("OR") != string::npos) {
            // OR query: term1 OR term2
            size_t orPos = queryStr.find(" OR ");
            string term1 = queryStr.substr(0, orPos);
            string term2 = queryStr.substr(orPos + 4);
            auto& term1List = invertedIndex[term1];
            auto& term2List = invertedIndex[term2];
            // Mark the documents containing either term
            vector<bool> matchedDocs(docCount, false);
            for (const auto& entry : term1List) matchedDocs[entry.first] = true;
            for (const auto& entry : term2List) matchedDocs[entry.first] = true;
            vector<int> docsInOrder;
            for (int docId = 0; docId < docCount; ++docId) {
                if (matchedDocs[docId]) docsInOrder.push_back(docId);
            }
            // Output the matching lines
            bool foundMatch = false;
            for (size_t i = 0; i < docsInOrder.size(); ++i) {
                int docId = docsInOrder[i];
                if (i > 0) cout << "----------\n";
                // Collect the matching lines, removing duplicates
                set<int> matchedLines;
                for (const auto& entry : term1List) {
                    if (entry.first == docId) {
                        matchedLines.insert(entry.second);
                    }
                }
                for (const auto& entry : term2List) {
                    if (entry.first == docId) {
                        matchedLines.insert(entry.second);
                    }
                }
                // Print the matching lines
                for (int lineNum : matchedLines) {
                    cout << documents[docId][lineNum] << '\n';
                    foundMatch = true;
                }
            }
            if (!foundMatch) {
                cout << "Sorry, I found nothing.\n";
            }
        } else if (queryStr.find("NOT") != string::npos) {
            // NOT query: NOT term
            string term = queryStr.substr(4);
            // Mark the documents containing the term
            vector<bool> excludeDocs(docCount, false);
            if (invertedIndex.find(term) != invertedIndex.end()) {
                for (const auto& entry : invertedIndex[term]) {
                    excludeDocs[entry.first] = true;
                }
            }
            // Output every document that does not contain the term, in full
            bool foundMatch = false;
            for (int docId = 0; docId < docCount; ++docId) {
                if (!excludeDocs[docId]) {
                    if (foundMatch) cout << "----------\n";
                    for (const string& line : documents[docId]) {
                        cout << line << '\n';
                    }
                    foundMatch = true;
                }
            }
            if (!foundMatch) {
                cout << "Sorry, I found nothing.\n";
            }
        } else {
            // Single-term query
            string term = queryStr;
            if (invertedIndex.find(term) == invertedIndex.end() || invertedIndex[term].empty()) {
                cout << "Sorry, I found nothing.\n";
            } else {
                auto& termList = invertedIndex[term];
                // Deduplicate and sort
                set<pair<int, int>> uniqueEntries;
                for (const auto& entry : termList) {
                    uniqueEntries.insert(entry);
                }
                // Print in document order, then line order
                int prevDocId = -1;
                for (const auto& entry : uniqueEntries) {
                    int docId = entry.first;
                    int lineNum = entry.second;
                    if (docId != prevDocId) {
                        if (prevDocId != -1) cout << "----------\n";
                        prevDocId = docId;
                    }
                    cout << documents[docId][lineNum] << '\n';
                }
            }
        }

        // End-of-result marker for each query
        cout << "==========\n";
    }

    return 0;
}
```
