当前位置：首页 > news >正文

Boost 搜索引擎

news 2025/9/30 7:40:04

boost搜索引擎

一、背景
二、技术栈和项目环境
三、正排索引 vs 倒排索引-搜索引擎具体原理
- 正排索引
四、编写去除标签与数据清洗的模块 Parser
五、编写建立索引模块index
六、编写搜索引擎模块Searcher.hpp
七、编写http_server模块
日志信息
问题总结

一、背景

像市场上的一些搜索引擎，比如百度、搜狗、360搜索等，都是大型项目，我们个人不可能独立完成。而我们这里做的是站内搜索（搜索数据更垂直，数据量更小）。例如 cplusplus 网站的站内搜索，就是一个只面向 C++ 文档的搜索引擎。

二、技术栈和项目环境

技术栈：C/C++，C++11，STL,准标准库Boost，jsoncpp,cppjieba,cpp-httplib，选学：html5,css,js,jquery,Ajax

三、正排索引 vs 倒排索引-搜索引擎具体原理

正排索引

就是从文档ID找到文档内容（文档里的关键字）
目标文档进行分词（目的：方便建立倒排索引和查找）：
雷军买了四斤小米：雷军/买/四斤/小米

文档1：雷军买了四斤小米
文档2：雷军发布了小米手机

文档id	文档内容
1	雷军买了四斤小米
2	雷军发布了小米手机

停止词：了，的、吗，a, the
倒排索引：根据关键字，找到文档ID的方案

关键字(具有唯一性)	文档id
雷军	1、2
买	1
四斤	1
小米	1、2
发布	2

模拟一次查找过程

用户输入：小米->倒排索引中查找->提取出文档ID(1,2)->根据正排索引->找到文档内容-> title+content(desc)+url文档结果进行摘要->构建响应结果

将boost下doc文档里html文档内容全部拷贝到data下的input文件里，当作数据源

cp -rf boost_1_89_0/doc/html/* data/input

四、编写去除标签与数据清洗的模块 Parser

boost官网：boost.org
//目前只需要boost_1_89_0/doc/html下文件，用它来做索引

去标签之后的数据

syl@syl-virtual-machine:~/桌面/boost_search$ touch Parser.cc

syl@syl-virtual-machine:~/桌面/boost_search/data$ mkdir raw_html
syl@syl-virtual-machine:~/桌面/boost_search/data$ ll
总用量 24
drwxrwxr-x  4 syl syl  4096 8月  30 20:34 ./
drwxrwxr-x  4 syl syl  4096 8月  30 20:26 ../
drwxrwxr-x 56 syl syl 12288 8月  30 20:33 input/  //这里放的是原始文档
drwxrwxr-x  2 syl syl  4096 8月  30 20:34 raw_html/    //这里放的是去标签之后的文档
目标：把每个文档去标签，然后写入到同一个文件中！每个文档内容只占“一行”，同一文档内的 title、content、url 之间用 \3 分隔，文档和文档之间用 \n 区分。

编写parser.cc

#include<fstream>
#include<iostream>
#include<string>
#include<vector>
#include<boost/filesystem.hpp>
#include"util.hpp"
using namespace std;//是一个目录 放的是所有html文件const string src_path="data/input/";  //const string raw="/data/raw_html/raw.txt";typedef struct DocInfo
{string title;// 标题string content;//文档内容string url;//文档的url
}DocInfo_t;bool Enumfile(const string& src_path,vector<string>* file_list);bool ParseHtml(const vector<string>& file_list,vector<DocInfo_t>* result );bool SaveHtml(const vector<DocInfo_t>& result,const string& output);int main()
{vector<string>file_list;//递归式的把每个html文件名带路径保存到file_list种if(!Enumfile(src_path,&file_list))cerr<<"enum file name error!"<<endl;return 1;//第二步按照file_list读取每个文件内容 并进行解析vector<DocInfo_t>result;if(!ParseHtml(file_list,&result)){cerr<<"parse html error!"<<endl;return 2;}//第三步：把解析完毕的各个文件内容，写入outputstring output;if(!SaveHtml(result,output)){cerr<<"save html error!"<<endl;return 3;}
}bool Enumfile(const string& src_path,vector<string>* file_list)
{namespace fs=boost::filesystem;fs::path root_path(src_path); //定义个变量初始化//判断路径是否存在if(!fs::exists(root_path)){cerr<<src_path<<"not exist"<<endl;return false;}//定义一个空的迭代器 用来进行判断递归结束fs::recursive_directory_iterator end;for(fs::recursive_directory_iterator iter(root_path);iter!=end;iter++){//是否是普通文件 html都是普通文件if(!fs::is_regular_file(*iter)){continue;    }if(iter->path().extension()!=".html")continue;cout<<"debug: "<<iter->path().string();//当前路径一定是一个合法的 以.html结束的普通网页文件file_list->push_back(iter->path().string());//将所有带路径的html文件保存到file_list 方便后面分析}//定义一个空的迭代器，用来进行判断递归结束return true;}//* 输出 & 输入
// Extracts the text between "<title>" and "</title>" of an HTML document.
//   file:  complete HTML text (input)
//   title: receives the title text (output)
// Returns false when the opening tag is missing or no closing tag follows it.
static bool ParseTitle(const std::string& file, std::string* title)
{
    const std::string open_tag = "<title>";
    const std::string close_tag = "</title>";

    const std::size_t open_pos = file.find(open_tag);
    if (open_pos == std::string::npos)
    {
        return false;
    }

    // Skip past the opening tag so begin points at the first title character,
    // and look for the closing tag only after that point.
    const std::size_t begin = open_pos + open_tag.size();
    const std::size_t end = file.find(close_tag, begin);
    if (end == std::string::npos)
    {
        return false;
    }

    *title = file.substr(begin, end - begin);
    return true;
}
// Next: parse the document body by stripping the tags (the text itself is kept as is).
static bool ParseContent(const string& file, string* content)
{//小型状态机enum STATUS{LABLE,     //算是标签CONTENT   //文本内容};enum STATUS s=LABLE; //for(auto c:file){switch(c){//只要碰到右标签 当前的标签被处理完毕case LABLE:if(c=='>')  s=CONTENT;break;case CONTENT:if(c=='<')  s=LABLE;//意味着新的标签开始 文本结束else{//不想保存源文件中的回车换行符 \n作为解析之后的文本分隔符if(c=='\n'){c=' ';}content->push_back(c);//插入}break;default:break;}}return true;
}static bool ParseUrl(const string& file_path,string* url)
{string url_head="https://www.boost.org/doc/libs/1_89_0/doc/html";string url_tail=file_path.substr(src_path.size());*url =url_head+url_tail;return true;
}bool ParseHtml(const vector<string>& file_list,vector<DocInfo_t>* result )
{for(const string& file:file_list){//读取文件Read() 到resultsstring results;if(!ns_utill::FileUtil::ReadFile(file,&results)){continue;}DocInfo_t doc;//解析指定文件 提取title 从results解析到title种if(!ParseTitle(results,&doc.title)){continue;}//解析指定文件，提取contentif(!ParseContent(results,&doc.content)){continue;}//解析指定文件路径if(!ParseUrl(results,&doc.url)){continue;}result->push_back(doc);//for Debug//ShowDebug(doc);}return true;}
//网页保存至目标文件中bool SaveHtml(const vector<DocInfo_t>& result,const string& output)
{
#define SEP '\3'ofstream ou(output,ios::out||ios::binary);if(!ou.is_open()){cerr<<"open"<<output<<"error"<<endl;return false;}for(auto& con:result){string out_str;out_str+=con.title;out_str+=SEP;out_str+=con.content;out_str+=SEP;out_str+=con.url;out_str+='\n';}ou.close();return true;}

构建url

官网url：https://www.boost.org/doc/libs/1_89_0/doc/html/accumulators.html
本地路径：data/input/accumulators.html //我们把下载的库copy到我们自己的根目录下面
url_head ="https://www.boost.org/doc/libs/1_89_0/doc/html";
url_tail=[data/input] /accumulators.html-> url_tail=/accumulators.html
url=url_head+url_tail

将解析内容保存到目标文件中

采用下面的方案：
version2: 写入文件中，一定要考虑下一次在读取的时候，也要方便操作!
类似：title\3content\3url \n title\3content\3url \n title\3content\3url \n ...
方便我们getline(ifstream, line)，直接获取文档的全部内容：title\3content\3url

bool SaveHtml(const vector<DocInfo_t>& result,const string& output)
{
#define SEP '\3'ofstream ou(output,ios::out||ios::binary);if(!ou.is_open()){cerr<<"open"<<output<<"error"<<endl;`return false;``}``for(auto& con:result)``{``string out_str;``out_str+=con.title;``out_str+=SEP;``out_str+=con.content;``out_str+=SEP;``out_str+=con.url;``out_str+='\n';``}``ou.close();``return true;`
`}`

五、编写建立索引模块index

id	文档内容
1	雷军买了四斤小米
2	雷军发布了小米手机

建立正排的代码

struct DocInfo{string title;string content;string url;uint64_t doc_id;};DocInfo* BuildForwardIndex(const string& line){//解析line 字符串切分 line ->3 string, title,content,urlvector<string> results;const string sep= "\3";ns_utill::StringUtil::CutString(line,&results,sep); //字符串切割if(results.size()!=3){return nullptr;}//字符串进行填充到DocInfoDocInfo doc;doc.title=results[0];doc.content=results[1];//doc.url=results[2];doc.doc_id=forward_index.size();//先保存id，再插入，对应的id就是当前doc在vector中的下标！//插入到正排索引forward_index.push_back(doc);return &forward_index.back();}

建立倒排

  // One entry of an inverted list: "this keyword occurs in this document".
  struct InteredElm
  {
      uint64_t doc_id = 0;
      std::string word;  // keyword
      int weight = 0;    // relevance of the document for this keyword
  };

  // Inverted list ("posting list"): every entry for one keyword.
  // Entries are stored by value, as in the complete index.hpp listing.
  typedef std::vector<InteredElm> InteredList;

  // Inverted index: maps each keyword to its inverted list.
  std::unordered_map<std::string, InteredList> Intered_index;

  // Next step: word-frequency statistics.

结构

namespace ns_Index
{
    // One document stored in the forward index.
    struct DocInfo
    {
        std::string title;
        std::string content;
        std::string url;
        uint64_t doc_id;
    };

    // One entry of an inverted list.
    struct InteredElm
    {
        uint64_t doc_id;
        std::string word;  // keyword
        int weight;
        InteredElm():doc_id(0),weight(0){}
    };

    // Inverted list; the element type is the struct declared just above.
    typedef std::vector<InteredElm> InvertedList;

    // Skeleton of the index class: every method is a stub to be filled in later.
    class Index
    {
    private:
        // Forward index: a vector whose subscript is the document id.
        std::vector<DocInfo> forward_index;
        // Inverted index: keyword -> inverted list.
        std::unordered_map<std::string, InvertedList> inverted_index;
    public:
        Index(){}
        ~Index(){}
    public:
        // Looks up a document by id. Stub: currently always returns nullptr.
        DocInfo *GetForwardIndex(uint64_t doc_id)
        {
            return nullptr;
        }
        // Looks up the inverted list of a keyword. Stub: currently always returns nullptr.
        InvertedList *GetInvertedList(const std::string &word)
        {
            return nullptr;
        }
        // Builds both indexes from the cleaned data file produced by the parser
        // (data/raw_html/raw.txt). Stub: currently always returns true.
        bool BuildIndex(const std::string &input)
        {
            return true;
        }
    };
}

index.hpp 代码

#pragma oncenamespace ns_Index
{struct DocInfo{string title;string content;string url;uint64_t doc_id;};struct InteredElm{uint64_t doc_id;string word;  //关键字int weight;  InteredElm():weight(0){}};//倒排拉链typedef vector<InteredElm> InteredList;class Index{public:Index(){};Index(const Index&)=delete;Index& operator=(const Index&)=delete;static mutex loc;static Index* instance;public:~Index(){}//单例static Index* GetInstance(){if(nullptr==instance){loc.lock();if(nullptr==instance){instance=new Index();}loc.unlock();}return instance;}//根据id找到文档内容//正排索引对应的元素DocInfo* GetForwardIndex(uint64_t doc_id){if(doc_id>=forward_index.size()){cerr<<doc_id<<"unfind error!"<<endl;}return &(forward_index[doc_id]);}//根据关键字string获得倒排拉链 根据关键字查找//获取倒排索引元素InteredList* GetInteredList(const string& word) // 有误{auto iter=Intered_index.find(word);if(iter==Intered_index.end()){cerr<<word<<" have no InteredList"<<endl;return nullptr;}return &(iter->second);}//根据去标签，格式化之后的⽂档，构建正排和倒排索引//data/raw_html/raw.txtbool buildIndex(const string& input){std::ifstream in(input,std::ios::in | ios::binary);if(!in.is_open()){cerr<<"sorry"<<input<<"open error"<<endl;return false;}//到这里就把文件打开了int cnt=0;string line;while(std::getline(in,line)){DocInfo* doc=BuildForwardIndex(line);//正排if(nullptr==doc){cerr<<"build error"<<endl;return false;}BuildInveredIndex(*doc);//倒排cnt++;if(cnt%50==0){//cout<<"当前建立索引文档："<<cnt<<endl;LOG(NORMAL,"当前已经建立的索引文档: "+std::to_string(cnt));}}return true;}private://正排索引DocInfo* BuildForwardIndex(const string& line){//解析line 字符串切分 line ->3 string, title,content,urlvector<string> results;const string sep= "\3";ns_utill::StringUtil::Split(line,&results,sep); //字符串切割if(results.size()!=3){std::cout << results.size() << std::endl;std::cout << "line" << line << std::endl;return nullptr;}//字符串进行填充到DocInfoDocInfo doc;doc.title=results[0];doc.content=results[1];//doc.url=results[2];doc.doc_id=forward_index.size();//先保存id，再插入，对应的id就是当前doc在vector中的下标！//插入到正排索引forward_index.push_back(doc);return &forward_index.back();}bool 
BuildInveredIndex(const DocInfo& doc){//DocInfo{tile,content,url,doc_id}//word->倒排struct word_cnt{int title_cnt;int content_cnt;word_cnt():title_cnt(0),content_cnt(0){}};vector<string>title_words;unordered_map<string,word_cnt>word_map; //存储容器 暂存词频ns_utill::JiebaUtil::CutString(doc.title,&title_words);//存到该容器里//对标题进行解析for(auto& c:title_words){boost::to_lower(c);word_map[c].title_cnt++;}vector<string>content_words;ns_utill::JiebaUtil::CutString(doc.content,&content_words);//对内容进行解析for(auto& s:content_words){boost::to_lower(s);word_map[s].content_cnt++;}
#define X 10#define Y 1//hello HELLO 这一步好比搜索 找相应关联 得出搜索结果for(auto& word_pair:word_map){InteredElm item;item.doc_id=doc.doc_id;item.word=word_pair.first;            item.weight=X*word_pair.second.title_cnt+Y*word_pair.second.content_cnt;//dInteredList& Intered_list=Intered_index[word_pair.first];//k值Intered_list.push_back(move(item));}return true;}private://正排索引 的数据结构用数组 数组的下标是文档的idvector<DocInfo>forward_index;//  倒排索引 一定是一个一一映射关系【关键字和倒排拉链之间的关系】unordered_map<string,InteredList>Intered_index;};Index* Index::instance=nullptr;std::mutex Index::loc;
};

六、编写搜索引擎模块Searcher.hpp

基本结构

namespace ns_searcher{class Searcher{private:ns_index::Index *index; //供系统进⾏查找的索引public:Searcher(){}~Searcher(){}public:void InitSearcher(const std::string &input)12 {//1. 获取或者创建index对象//2. 根据index对象建⽴索引}//query: 搜索关键字//json_string: 返回给⽤⼾浏览器的搜索结果void Search(const std::string &query, std::string *json_string){//1.[分词]:对我们的query进⾏按照searcher的要求进⾏分词//2.[触发]:就是根据分词的各个"词"，进⾏index查找//3.[合并排序]：汇总查找结果，按照相关性(weight)降序排序//4.[构建]:根据查找出来的结果，构建json串 -- jsoncpp}};}

完整代码(Searcher.hpp)

namespace ns_search{//解决搜索结果出现重复文档struct InvertedElmPrint{uint64_t doc_id;int weight;vector<string>words;InvertedElmPrint():doc_id(0),weight(0){}};class Searcher{private:ns_Index::Index* index;public:Searcher(){}void InitSearcher(const string& input){//获取或者创建index对象index=ns_Index::Index::GetInstance();LOG(NORMAL,"获取单例成功");//根据index对象建立索引index->buildIndex(input);// cout<<"建立正排倒排索引成功..."<<endl;LOG(NORMAL,"建立正排倒排索引成功");}void search(const std::string& query,string* json_search){vector<string>words;ns_utill::JiebaUtil::CutString(query,&words);unordered_map<uint64_t,InvertedElmPrint>tokens_map;//2、触发：根据分词的各个词 进行index查找vector<InvertedElmPrint>intered_list_all;for(string word:words){boost::to_lower(word);ns_Index::InteredList *intered_list=index->GetInteredList(word);//if(nullptr==intered_list){continue;}//intered_list_all.insert(intered_list_all.end(),intered_list->begin(),intered_list->end());for(const auto& elm:*intered_list){auto& item=tokens_map[elm.doc_id];//[];如果存在直接获取 如果不存在新建//item 一定是doc_id 相同的print 节点item.doc_id=elm.doc_id;item.weight+=elm.doc_id;item.words.push_back(elm.word);}}for(const auto& its:tokens_map){intered_list_all.push_back(its.second);}//排序 按weight降序排序并去重sort(intered_list_all.begin(),intered_list_all.end(),[](const InvertedElmPrint& e1,const InvertedElmPrint& e2){return e1.weight>e2.weight;});//将所有结果按json串格式返回Json::Value root;for(auto& it:intered_list_all){ns_Index::DocInfo* doc=index->GetForwardIndex(it.doc_id);if(nullptr==doc){continue;}Json::Value elm;elm["title"]=doc->title;elm["content"]=GetDesc(doc->content,it.words[0]);elm["url"]=doc->url;//for dug//elm["weight"]=it.weight;root.append(elm);}Json::StyledWriter writer;*json_search=writer.write(root);}//获取摘要string GetDesc(const string& html_content,const string& word){//找到word在html_content中首次出现 然后往前找50字节（如果没有 从begin找）往后100找const int prev_step=50;const int next_step=100;//找到首次出现位置auto iter=std::search(html_content.begin(),html_content.end(),word.begin(),word.end(),[](int x,int y){return 
(tolower(x)==tolower(y));});if(iter==html_content.end()){return "None1";}int pos=std::distance(html_content.begin(),iter);//获取start endint start=0;int end=html_content.size()-1;//如果有50位置 就更新开始位置if(pos>start+prev_step) start=pos-prev_step;if(pos<end-next_step)   end=pos+next_step;//3、截取子串 returnif(start>=end)  return "None2";    string desc=html_content.substr(start,end-start);return desc;}};};

安装jsoncpp

sudo apt install -y libjsoncpp-dev

七、编写http_server模块

cpp-httplib库安装路径：https://gitee.com/zhangkt1995/cpp-httplib?_from=gitee_search

const std::string root_path="./wwwroot";
const std::string input="data/raw_html/raw.txt";int main()
{ns_search::Searcher sea;sea.InitSearcher(input);httplib::Server sur;sur.set_base_dir(root_path.c_str());sur.Get("/s",[&sea](const httplib::Request& req,httplib::Response& res){if(!req.has_param("word")){res.set_content("必须要搜索的关键词","text/plain: charset=utf-8");return;}std::string word=req.get_param_value("word");LOG(NORMAL,"用户在搜索："+word);//std::cout<<"用户在搜索"<<word<<std::endl;std::string json_string;sea.search(word,&json_string);res.set_content(json_string,"application/json");});// sur.Get("/hi",[](const httplib::Request& req,httplib::Response& res){//     res.set_content("年后再说","text/plain");// });LOG(NORMAL,"服务器启动成功....");sur.listen("0.0.0.0",8081);return 0;}

日志信息

#pragma once
#include<iostream>
#include<ctime>
#include<string>

// Log levels. LOG() prints the level NAME (via stringification), not the number.
#define NORMAL 1
#define WARNNING 2
#define DEBUG 3
#define FATAL 4

// LOG(LEVEL, MESSAGE): logs MESSAGE together with the call site's file and line.
#define LOG(LEVEL,MESSAGE) log(#LEVEL,MESSAGE,__FILE__,__LINE__)

// Prints one log line to stdout:
//   [level][seconds since the epoch][message][file:line]
// `inline` because the function is defined in a header that several
// translation units may include.
inline void log(const std::string& level,const std::string& message,const std::string& file,int line)
{
    std::cout<<"["<<level<<"]"
             <<"["<<time(nullptr)<<"]"
             <<"["<<message<<"]"
             <<"["<<file<<":"<<line<<"]"
             <<std::endl;
}

问题总结

遇到的问题：
在进行综合调试的时候（debug.cc），发现什么也搜不到，不管搜索什么都是空的。
逐一检查后，最终发现是 Parser.cc 出错了：content 是空的，url 里面混杂着 content 的内容。原因是在 ParseHtml 函数中解析文件路径（调用 ParseUrl）时传错了一个参数，错把 results（文件内容）当成 file（文件路径）传了进去。一开始以为是索引建立有问题，于是顺着 Searcher.hpp 里的 search 函数一路排查过去。后面调试时又发现，search 函数在把所有结果按 json 格式返回的那一段中，调用 GetForwardIndex 时参数也传错了。

查看全文

http://www.dtcms.com/a/422737.html