我想实现一个实时文件搜索网页:输入要搜索的文件关键字后,后台不断爬取文件,同时把爬取到的文件实时展示在网页上。比如输入“视频”,就会把所有视频文件的文件名、文件链接和来源页链接实时展示出来,点击链接可以跳转;只要不关闭网页,就一直爬取并显示。

第一步:演示
import json
import threading
import time

import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template_string, request, Response

app = Flask(__name__)
# Shared crawler state. These module-level names are read and written by the
# route handlers and by the background crawl thread.

# True while a crawl is supposed to be running; the crawl loop polls this flag.
is_crawling = False
# The most recently started crawler thread, or None before the first start.
crawl_thread = None
# FIFO list of result dicts: the crawler appends, the SSE endpoint pops index 0.
results_stream = []
# Single-page UI served at "/". It is passed through render_template_string,
# so the markup must not contain Jinja delimiters (double braces, brace-percent,
# brace-hash). Backslashes are avoided in the script because this is a regular
# (non-raw) Python string.
HTML_TEMPLATE = '''
<!DOCTYPE html>
<html lang="zh">
<head>
<meta charset="UTF-8">
<title>实时文件搜索引擎</title>
<style>
body { font-family: Arial, sans-serif; margin: 40px; }
input[type=text] { width: 300px; padding: 10px; font-size: 16px; }
button { padding: 10px 20px; font-size: 16px; margin-left: 10px; }
.file-item {
    border-bottom: 1px solid #eee;
    padding: 15px 0;
    background: #f9f9f9;
    border-radius: 8px;
    margin: 10px 0;
    transition: all 0.3s;
}
.file-item:hover { transform: translateY(-2px); box-shadow: 0 4px 8px rgba(0,0,0,0.1); }
.filename { color: #1a0dab; font-weight: bold; font-size: 1.1em; }
.source { color: #006621; font-size: 0.9em; margin: 5px 0; }
.source a { color: inherit; word-break: break-all; }
.link {
    color: #0066cc;
    text-decoration: none;
    word-break: break-all;
    display: block;
    margin: 8px 0;
    font-size: 0.95em;
}
.timestamp { color: #666; font-size: 0.8em; }
#status { margin: 20px 0; padding: 10px; background: #e6f7ff; border-radius: 4px; }
</style>
</head>
<body>
<h1>🔍 实时文件搜索引擎</h1>
<div>
    <input type="text" id="keyword" placeholder="请输入要搜索的文件类型,例如:视频、文档、PDF..." value="视频">
    <button onclick="startSearch()">开始搜索</button>
    <button onclick="stopSearch()">停止搜索</button>
</div>
<div id="status">等待用户操作...</div>
<hr>
<div id="results"></div>
<script>
let eventSource = null;
// Incremented on every start/stop click so that a slow, superseded
// start request cannot open a stream after the user has moved on.
let searchSeq = 0;

// Return an absolute http(s) URL, or null for anything else
// (for example javascript: URLs coming from crawled pages).
function safeHref(url) {
    try {
        const parsed = new URL(url, window.location.href);
        if (parsed.protocol === 'http:' || parsed.protocol === 'https:') {
            return parsed.href;
        }
    } catch (e) {
        // Not a parseable URL: fall through and render it as plain text.
    }
    return null;
}

// Build a clickable link; falls back to a plain span when the URL is unsafe.
// Text is always set through textContent so crawled data is never parsed as HTML.
function makeLink(url, className) {
    const href = safeHref(url);
    const node = document.createElement(href ? 'a' : 'span');
    if (className) node.className = className;
    node.textContent = url;
    if (href) {
        node.href = href;
        node.target = '_blank';
        node.rel = 'noopener noreferrer';
    }
    return node;
}

function makeDiv(className, text) {
    const node = document.createElement('div');
    node.className = className;
    node.textContent = text;
    return node;
}

function appendStatus(text, color) {
    const status = document.getElementById('status');
    const span = document.createElement('span');
    if (color) span.style.color = color;
    span.textContent = text;
    status.appendChild(document.createTextNode(' '));
    status.appendChild(span);
}

function renderResult(data) {
    const item = document.createElement('div');
    item.className = 'file-item';
    item.appendChild(makeDiv('filename', data.filename));
    const source = makeDiv('source', '来源页: ');
    source.appendChild(makeLink(data.source, ''));
    item.appendChild(source);
    item.appendChild(makeLink(data.url, 'link'));
    item.appendChild(makeDiv('timestamp', '发现时间: ' + data.timestamp));
    // Newest result goes on top.
    document.getElementById('results').prepend(item);
}

function openStream(query) {
    if (eventSource) eventSource.close();
    let errorShown = false;
    eventSource = new EventSource('/stream' + query);
    eventSource.onmessage = function (event) {
        renderResult(JSON.parse(event.data));
    };
    eventSource.onerror = function () {
        // EventSource retries on its own and fires an error for every failed
        // attempt; report the problem only once per connection.
        if (!errorShown) {
            errorShown = true;
            appendStatus('● 连接异常');
        }
    };
}

function startSearch() {
    const keyword = document.getElementById('keyword').value.trim();
    if (!keyword) return alert("请输入关键词!");

    // Drop the old connection and the old results.
    if (eventSource) {
        eventSource.close();
        eventSource = null;
    }
    document.getElementById('status').textContent = `正在搜索 '${keyword}'...`;
    appendStatus('● 连接中', 'green');
    document.getElementById('results').textContent = '';

    const seq = ++searchSeq;
    const query = '?keyword=' + encodeURIComponent(keyword);
    // Stop any running crawl first so a changed keyword takes effect, then
    // start the new crawl, and only then subscribe to the result stream.
    fetch('/stop')
        .then(() => fetch('/start' + query))
        .then(() => {
            if (seq === searchSeq) openStream(query);
        })
        .catch(() => appendStatus('● 连接异常'));
}

function stopSearch() {
    searchSeq++;
    if (eventSource) {
        eventSource.close();
        eventSource = null;
        appendStatus('● 已手动断开', 'orange');
    }
    fetch('/stop');
}
</script>
</body>
</html>
'''

# Maximum number of undelivered results kept in results_stream. Without a cap
# the list grows for as long as a crawl runs with no SSE client connected.
MAX_PENDING_RESULTS = 1000
# Seconds an SSE stream may stay silent before a comment line is sent. The
# write is what lets the server notice that the client has gone away.
HEARTBEAT_INTERVAL = 15
# Seconds between two simulated file discoveries.
CRAWL_INTERVAL = 3

# Serialises start/stop so two requests cannot both launch a crawler.
_crawl_lock = threading.Lock()
# Stop signal of the current crawl run; replaced by a fresh Event on each start
# so that a thread from an earlier run can never be revived by a later start.
_stop_event = None


@app.route('/')
def index():
    """Serve the search page."""
    return render_template_string(HTML_TEMPLATE)


@app.route('/start')
def start_crawling():
    """Start the background crawl for the ``keyword`` query argument.

    A missing or blank keyword falls back to '视频'. If a crawl is already
    running nothing is started and the "already crawling" message is returned.
    Results left over from a previous run are discarded before the new crawl
    begins.
    """
    global is_crawling, crawl_thread, _stop_event
    keyword = (request.args.get('keyword') or '').strip() or '视频'
    with _crawl_lock:
        if is_crawling:
            return "已在爬取中..."
        results_stream.clear()
        _stop_event = threading.Event()
        is_crawling = True
        # daemon=True: the crawl loop is endless by design and must not keep
        # the interpreter alive when the server is shut down.
        crawl_thread = threading.Thread(
            target=crawl_task,
            args=(keyword, _stop_event),
            daemon=True,
        )
        crawl_thread.start()
    return "爬取已启动"


@app.route('/stop')
def stop_crawling():
    """Signal the running crawl (if any) to stop and return immediately."""
    global is_crawling
    with _crawl_lock:
        is_crawling = False
        if _stop_event is not None:
            # Wakes the crawler out of its inter-item wait right away.
            _stop_event.set()
    return "已发送停止信号"


@app.route('/stream')
def stream():
    """SSE endpoint that pushes queued results to the client as they appear.

    Each result is sent as one ``data:`` event containing the JSON-encoded
    result dict. Results are taken from the shared ``results_stream`` list, so
    a given result is delivered to exactly one connected client.
    """
    def generate():
        last_write = time.monotonic()
        while True:
            try:
                # pop + except instead of len-check + pop: another stream may
                # empty the list between the check and the pop.
                result = results_stream.pop(0)
            except IndexError:
                if time.monotonic() - last_write >= HEARTBEAT_INTERVAL:
                    # Lines starting with ':' are SSE comments and are ignored
                    # by EventSource.
                    yield ': keep-alive\n\n'
                    last_write = time.monotonic()
                else:
                    time.sleep(0.1)
                continue
            last_write = time.monotonic()
            yield 'data: {}\n\n'.format(json.dumps(result))

    response = Response(generate(), mimetype='text/event-stream')
    # Ask browsers and reverse proxies not to cache or buffer the stream.
    response.headers['Cache-Control'] = 'no-cache'
    response.headers['X-Accel-Buffering'] = 'no'
    return response


def crawl_task(keyword, stop_event=None):
    """Simulated crawl: repeatedly publish a fixed set of mock files.

    Args:
        keyword: Search keyword; it is only used as a prefix of the reported
            file names in this mock implementation.
        stop_event: Optional ``threading.Event``; when set, the task ends.
            If omitted, the task is controlled by ``is_crawling`` alone.

    The task runs until ``is_crawling`` becomes false or ``stop_event`` is set,
    appending one result dict (filename, url, source, timestamp) to
    ``results_stream`` every ``CRAWL_INTERVAL`` seconds.
    """
    if stop_event is None:
        stop_event = threading.Event()

    # (file name, file URL, source page URL)
    mock_urls = (
        ("教学视频.mp4", "https://example.com/video1.mp4", "https://example.com/page1"),
        ("宣传片_高清.avi", "https://mirror-site.org/vid.avi", "https://example.com/page2"),
        ("讲座回放.flv", "https://cdn-host.net/flv/lecture.flv", "https://edu-site.com/lectures"),
        ("培训资料.mkv", "https://training-site.com/materials.mkv", "https://training-site.com/courses"),
        ("演示文稿.mov", "https://media-host.com/demo.mov", "https://corp-site.com/media"),
    )

    while is_crawling and not stop_event.is_set():
        for filename, link, source in mock_urls:
            if not is_crawling or stop_event.is_set():
                break
            result_item = {
                "filename": f"[{keyword}]{filename}",
                "url": link,
                "source": source,
                "timestamp": time.strftime("%H:%M:%S"),
            }
            print(f"找到文件并推送: {result_item}")
            results_stream.append(result_item)
            if len(results_stream) > MAX_PENDING_RESULTS:
                # Nobody is consuming fast enough: drop the oldest entry.
                del results_stream[0]
            # Interruptible pause: returns early as soon as a stop is signalled.
            stop_event.wait(CRAWL_INTERVAL)
    print("爬取结束")


if __name__ == '__main__':
    # NOTE(review): debug=True enables the interactive Werkzeug debugger; keep
    # this for local development only. threaded=True lets the long-lived SSE
    # response coexist with the /start and /stop requests.
    app.run(port=5000, debug=True, use_reloader=False, threaded=True)