当前位置：首页 > news >正文

VVIC 关键字搜索接口开发：快时尚场景下的智能分词与爆款优先排序实现

news 2025/10/24 10:56:51

一、接口定位与快时尚搜索的技术特殊性

VVIC（搜款网）作为聚焦服装快时尚的批发平台，其关键字搜索接口与传统电商存在本质差异：需解决 "风格词模糊匹配"（如 "韩系慵懒风"）、"元素组合搜索"（如 "碎花 + 泡泡袖 + 连衣裙"）、"批发场景过滤"（如 "10 件起批 + 现货"）等特殊需求。普通搜索接口的精确匹配逻辑完全不适用于快时尚采购场景。

本文方案区别于网络上的基础爬虫实现，核心突破点在于：

构建快时尚专属分词系统（支持风格词、元素词、场景词的多维度拆分）
开发爆款因子加权算法（将销量增长率、收藏趋势等转化为搜索排序权重）
实现批发属性过滤引擎（精准识别 "混批"、"现货"、"定制" 等采购场景关键词）

二、核心技术方案与搜索维度设计

1. 快时尚搜索专属数据维度

搜索维度	核心字段	技术处理方式
基础属性	商品 ID、标题、主图、价格	常规索引存储
风格标签	韩系、法式、通勤、复古等	构建风格词库，实现模糊匹配
元素标签	碎花、蕾丝、oversize 等	支持多元素组合搜索（+ 号连接）
批发属性	起订量、现货率、混批支持	解析文本中的数字与规则描述
爆款因子	7 天销量、增长率、收藏数	量化为 0-100 分的排序权重
供应商属性	档口位置、上新频率、爆款率	作为筛选条件与二次排序依据

2. 差异化搜索流程设计

用户输入关键词

分词处理

基础词拆分

风格词识别

元素词提取

场景词解析

合并分词结果（基础词 & 风格词 & 元素词 & 场景词）

构建多维度查询

基础结果召回

爆款因子加权排序

批发属性过滤

结果返回

点击获取key和secret

三、核心代码实现：从分词到排序的全链路

VVIC关键字搜索接口实现，含快时尚分词与爆款排序

import time
import json
import logging
import random
import re
import hashlib
from typing import Dict, List, Tuple, Optional
from datetime import datetime, timedelta
import requests
import redis
import numpy as np
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure logging: INFO level, each record formatted as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class VVICSearchEngine:
    """Keyword search client for VVIC with fashion-aware segmentation and ranking.

    A query is split into style / element / category / scene words plus numeric
    filters, the raw result page is fetched from the VVIC search endpoint, each
    item gets a 0-100 "bestseller" score, and the page is filtered and sorted
    locally. Successful results are cached in Redis.
    """

    def __init__(self, redis_host: str = 'localhost', redis_port: int = 6379,
                 proxy_pool: Optional[List[str]] = None, cache_ttl: int = 300):
        """Create the engine.

        :param redis_host: Redis host name
        :param redis_port: Redis port
        :param proxy_pool: optional list of proxy URLs; one is picked at random
            per request, and an empty/None pool means no proxy
        :param cache_ttl: cache expiry in seconds
        """
        # Result cache lives in Redis database index 14.
        self.redis = redis.Redis(host=redis_host, port=redis_port, db=14)

        # VVIC endpoints.
        self.base_url = "https://www.vvic.com"
        self.search_api = "https://www.vvic.com/search/ajaxsearch"

        self.session = self._init_session()
        self.proxy_pool = proxy_pool or []
        self.cache_ttl = cache_ttl

        # Random User-Agent generator.
        self.ua = UserAgent()

        # Domain vocabulary used by the segmenter.
        self.fashion_vocab = self._build_fashion_vocab()

        # Request-throttling / fingerprint-variation settings.
        self.anti_crawl = {
            "request_delay": (1.5, 3.5),   # (min, max) seconds slept before each request
            "header_rotation": True,       # rotate headers before each request
            "session_reset_interval": 15,  # rebuild the session every N requests
            "query_params_shuffle": True,  # randomise query-parameter order
        }

        self.request_count = 0

    def _init_session(self) -> "requests.Session":
        """Build a fresh HTTP session carrying the default AJAX-style headers."""
        session = requests.Session()
        session.headers.update({
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "X-Requested-With": "XMLHttpRequest",
            "Connection": "keep-alive",
        })
        return session

    def _build_fashion_vocab(self) -> Dict:
        """Return the built-in vocabulary.

        ``style`` and ``scene`` map a canonical word to its synonyms;
        ``element`` and ``category`` are flat word lists.
        """
        return {
            # Style words (canonical -> synonyms)
            "style": {
                "韩系": ["韩版", "韩式", "韩国风"],
                "法式": ["法国风", "法式复古"],
                "通勤": ["职场", "办公室", "职业装"],
                "复古": ["复古风", "做旧", "vintage"],
                "甜美": ["可爱", "甜妹", "少女风"],
            },
            # Design-element words
            "element": [
                "碎花", "格纹", "条纹", "波点", "蕾丝",
                "刺绣", "绑带", "荷叶边", "oversize", "短款",
                "修身", "宽松", "收腰", "露腰", "泡泡袖",
            ],
            # Product-category words
            "category": [
                "连衣裙", "上衣", "T恤", "衬衫", "裤子",
                "外套", "卫衣", "毛衣", "短裙", "长裙",
            ],
            # Wholesale-scene words (canonical -> synonyms)
            "scene": {
                "现货": ["现", "现货秒发", "有货"],
                "混批": ["混拿", "混款", "可混批"],
                "定制": ["定做", "定制logo", "来样定制"],
                "小批量": ["少量", "几件起批", "10件以下"],
            },
        }

    @staticmethod
    def _strip_terms(text: str, terms) -> str:
        """Remove every occurrence of each term from ``text``, longest term first.

        Longest-first matters because some synonyms contain their canonical
        word ("现货秒发" contains "现货"); removing the short form first would
        leave a fragment behind. Plain ``str.replace`` is used so vocabulary
        entries are never interpreted as regular expressions.
        """
        for term in sorted(terms, key=len, reverse=True):
            text = text.replace(term, '')
        return text

    def _fashion_word_segment(self, query: str) -> Dict:
        """Split a query into vocabulary dimensions and numeric filters.

        :param query: raw search keywords
        :return: dict with ``original``, ``base_words``, ``style_words``,
            ``element_words``, ``category_words``, ``scene_words`` and
            ``numeric_filters`` (possible keys: ``price_min``, ``price_max``,
            ``min_order``)
        """
        result = {
            "original": query,
            "base_words": [],
            "style_words": [],
            "element_words": [],
            "category_words": [],
            "scene_words": [],
            "numeric_filters": {},
        }

        # 1. Numeric filters. Price range such as "100-200元".
        price_match = re.search(r'(\d+)-(\d+)元', query)
        if price_match:
            # Normalise a reversed range ("200-100元") instead of producing a
            # filter that nothing can satisfy.
            low, high = sorted((int(price_match.group(1)), int(price_match.group(2))))
            result["numeric_filters"]["price_min"] = low
            result["numeric_filters"]["price_max"] = high
            query = re.sub(r'\d+-\d+元', '', query)

        # Minimum order quantity such as "5件起".
        moq_match = re.search(r'(\d+)件起', query)
        if moq_match:
            result["numeric_filters"]["min_order"] = int(moq_match.group(1))
            query = re.sub(r'\d+件起', '', query)

        # 2. Scene words, then 3. style words: record the canonical word and
        # strip it together with its synonyms from the remaining query.
        for vocab_key, result_key in (("scene", "scene_words"), ("style", "style_words")):
            for canonical, synonyms in self.fashion_vocab[vocab_key].items():
                if canonical in query or any(syn in query for syn in synonyms):
                    result[result_key].append(canonical)
                    query = self._strip_terms(query, [canonical, *synonyms])

        # 4. Element words, then 5. category words.
        for vocab_key, result_key in (("element", "element_words"), ("category", "category_words")):
            for word in self.fashion_vocab[vocab_key]:
                if word in query:
                    result[result_key].append(word)
                    query = query.replace(word, '')

        # 6. Whatever is left becomes the base words.
        result["base_words"] = [word for word in re.split(r'\W+', query) if word.strip()]
        return result

    def _rotate_headers(self) -> None:
        """Set a random User-Agent and randomly include/omit optional headers."""
        self.session.headers["User-Agent"] = self.ua.random
        self.session.headers["Referer"] = f"{self.base_url}/search"

        optional_headers = {
            "Accept-Encoding": "gzip, deflate, br",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
        }
        # Each optional header is present with probability ~0.7.
        for key, value in optional_headers.items():
            if random.random() > 0.3:
                self.session.headers[key] = value
            else:
                self.session.headers.pop(key, None)

    def _get_proxy(self) -> Optional[Dict]:
        """Return a ``proxies`` mapping for a random pool entry, or None if the pool is empty."""
        if not self.proxy_pool:
            return None
        proxy = random.choice(self.proxy_pool)
        return {"http": proxy, "https": proxy}

    def _reset_session(self) -> None:
        """Replace the HTTP session with a fresh one and rotate its headers."""
        # Close the old session first so its pooled connections are released.
        self.session.close()
        self.session = self._init_session()
        self._rotate_headers()
        logger.info("已重置VVIC搜索会话")

    def _anti_crawl_measures(self) -> None:
        """Sleep a random delay, rotate headers and periodically reset the session."""
        time.sleep(random.uniform(*self.anti_crawl["request_delay"]))

        if self.anti_crawl["header_rotation"]:
            self._rotate_headers()

        self.request_count += 1
        if self.request_count % self.anti_crawl["session_reset_interval"] == 0:
            self._reset_session()

    def _shuffle_query_params(self, params: Dict) -> List[Tuple]:
        """Return the parameters as a randomly ordered list of (key, value) pairs."""
        params_list = list(params.items())
        random.shuffle(params_list)
        return params_list

    def _calculate_bestseller_score(self, item: Dict) -> float:
        """Compute the bestseller score, always within 0-100.

        Components (each clamped to its own range):
        sales 0-30 (100 units = 30), growth rate 0-25 (100% = 25),
        favorites 0-20 (100 = 20), stock rate 0-15 (1.0 = 15),
        supplier bestseller rate 0-10 (1.0 = 10).

        :param item: product dict; missing fields count as 0
        :return: score rounded to one decimal
        """
        def clamp(value, upper):
            return min(upper, max(0, value))

        components = (
            clamp(item.get("sales", 0) * 30 / 100, 30),
            clamp(item.get("growth_rate", 0) / 4, 25),
            clamp(item.get("favorites", 0) / 5, 20),
            clamp(item.get("stock_rate", 0) * 15, 15),
            clamp(item.get("supplier_bestseller_rate", 0) * 10, 10),
        )
        return round(sum(components), 1)

    def _filter_by_scene(self, items: List[Dict], scene_words: List[str]) -> List[Dict]:
        """Keep only items satisfying every recognised scene word.

        :param items: product dicts
        :param scene_words: canonical scene words from segmentation
        :return: ``items`` itself when there are no scene words, else a new list
        """
        if not scene_words:
            return items

        def matches(item: Dict) -> bool:
            # In-stock: at least 80% stock rate.
            if "现货" in scene_words and item.get("stock_rate", 0) < 0.8:
                return False
            # Mixed-batch purchasing must be supported.
            if "混批" in scene_words and not item.get("support_mixed_batch", False):
                return False
            # Customisation must be supported.
            if "定制" in scene_words and not item.get("support_custom", False):
                return False
            # Small batch: minimum order of at most 10 pieces.
            if "小批量" in scene_words and item.get("min_order", 10) > 10:
                return False
            return True

        return [item for item in items if matches(item)]

    def _filter_by_numeric(self, items: List[Dict], numeric_filters: Dict) -> List[Dict]:
        """Keep only items within the requested price range / minimum order.

        :param items: product dicts
        :param numeric_filters: any of ``price_min``, ``price_max``, ``min_order``
        :return: ``items`` itself when there are no filters, else a new list
        """
        if not numeric_filters:
            return items

        def matches(item: Dict) -> bool:
            if "price_min" in numeric_filters and item.get("price", 0) < numeric_filters["price_min"]:
                return False
            if "price_max" in numeric_filters and item.get("price", 0) > numeric_filters["price_max"]:
                return False
            # The item's minimum order must not exceed what the buyer asked for.
            if "min_order" in numeric_filters and item.get("min_order", 0) > numeric_filters["min_order"]:
                return False
            return True

        return [item for item in items if matches(item)]

    def _cache_get(self, cache_key: str) -> Optional[Dict]:
        """Best-effort cache read: return the cached result dict or None.

        Cache failures are logged and treated as a miss so that a Redis outage
        never prevents a live search.
        """
        try:
            cached_data = self.redis.get(cache_key)
        except redis.RedisError as e:
            logger.warning(f"缓存读取失败: {str(e)}")
            return None
        if not cached_data:
            return None
        try:
            cached_result = json.loads(cached_data.decode())
        except ValueError as e:  # covers JSON and unicode decoding errors
            logger.warning(f"缓存解析失败: {str(e)}")
            return None
        return cached_result if isinstance(cached_result, dict) else None

    def _cache_set(self, cache_key: str, result: Dict) -> None:
        """Best-effort cache write; a failure is logged and otherwise ignored."""
        try:
            self.redis.setex(
                cache_key,
                timedelta(seconds=self.cache_ttl),
                json.dumps(result, ensure_ascii=False),
            )
        except (redis.RedisError, TypeError, ValueError) as e:
            logger.warning(f"缓存写入失败: {str(e)}")

    def _process_item(self, item: Dict) -> Dict:
        """Normalise one raw upstream item and attach its bestseller score."""
        processed = {
            "product_id": item.get("id", ""),
            "title": item.get("title", ""),
            "main_image": item.get("img", ""),
            "price": float(item.get("price", 0)),
            "original_price": float(item.get("original_price", 0)),
            "sales": int(item.get("sales", 0)),
            "growth_rate": float(item.get("growth_rate", 0)),  # 7-day growth rate
            "favorites": int(item.get("favorites", 0)),
            "stock_rate": float(item.get("stock_rate", 0)),
            "min_order": int(item.get("min_order", 1)),
            "support_mixed_batch": item.get("support_mixed_batch", False),
            "support_custom": item.get("support_custom", False),
            "supplier_id": item.get("supplier_id", ""),
            "supplier_bestseller_rate": float(item.get("supplier_bestseller_rate", 0)),
            "style_tags": item.get("style_tags", []),
            "element_tags": item.get("element_tags", []),
        }
        processed["bestseller_score"] = self._calculate_bestseller_score(processed)
        return processed

    def _execute_search(self, result: Dict, query: str, page: int, page_size: int,
                        sort_by: str, ascending: bool) -> None:
        """Run the live search and fill ``result`` in place.

        On an upstream failure ``result["status"]`` is set to "error" together
        with ``result["error"]``; unexpected exceptions propagate to the caller.
        """
        # 1. Segment the query.
        word_seg = self._fashion_word_segment(query)
        result["word_segment"] = word_seg

        # 2. Build request parameters ("t" is a millisecond timestamp).
        search_params = {
            "k": query,
            "page": page,
            "size": page_size,
            "t": int(time.time() * 1000),
        }

        # 3. Throttle, then issue the request.
        self._anti_crawl_measures()
        if self.anti_crawl["query_params_shuffle"]:
            params_list = self._shuffle_query_params(search_params)
        else:
            params_list = list(search_params.items())

        response = self.session.get(
            self.search_api,
            params=params_list,
            proxies=self._get_proxy(),
            timeout=15,
        )
        if response.status_code != 200:
            result["status"] = "error"
            result["error"] = f"搜索请求失败，状态码: {response.status_code}"
            return

        # 4. Parse the upstream payload.
        raw_data = response.json()
        if raw_data.get("code") != 0:
            result["status"] = "error"
            result["error"] = f"接口返回错误: {raw_data.get('msg', '未知错误')}"
            return

        data = raw_data.get("data") or {}
        raw_items = data.get("items") or []
        result["total"] = data.get("total", 0)

        # 5. Normalise items, then 6./7. apply scene and numeric filters.
        filtered_items = [self._process_item(item) for item in raw_items]
        filtered_items = self._filter_by_scene(filtered_items, word_seg["scene_words"])
        filtered_items = self._filter_by_numeric(filtered_items, word_seg["numeric_filters"])

        # 8. Sort; an unrecognised sort_by leaves the upstream order untouched.
        sort_field = {
            "bestseller": "bestseller_score",
            "price": "price",
            "sales": "sales",
        }.get(sort_by)
        if sort_field is not None:
            filtered_items.sort(key=lambda x: x[sort_field], reverse=not ascending)

        # 9. The upstream request was already paged with page/size, so the
        # items here ARE the requested page. Slicing by (page - 1) * page_size
        # again would return nothing for every page after the first; only cap
        # the length.
        result["items"] = filtered_items[:page_size]
        result["filtered_total"] = len(filtered_items)  # count after filtering

    def search(self, query: str, page: int = 1, page_size: int = 20,
               sort_by: str = "bestseller", ascending: bool = False) -> Dict:
        """Run a search and return a result dict.

        :param query: search keywords
        :param page: page number (1-based), forwarded to the upstream endpoint
        :param page_size: page size, forwarded to the upstream endpoint
        :param sort_by: "bestseller", "price" or "sales"; other values skip sorting
        :param ascending: sort ascending instead of descending
        :return: dict with ``status`` ("success"/"error"); on success also
            ``items``, ``total``, ``filtered_total``, ``word_segment`` and
            ``time_ms``; on error an ``error`` message. Cache hits carry
            ``from_cache: True``. This method does not raise for search
            failures; they are reported through ``status``.
        """
        start_time = time.time()
        result = {
            "query": query,
            "page": page,
            "page_size": page_size,
            "total": 0,
            "items": [],
            "time_ms": 0,
            "status": "success",
        }

        # The cache key is an MD5 digest of every argument that affects the result.
        cache_key = f"vvic:search:{query}:{page}:{page_size}:{sort_by}:{ascending}"
        cache_key = hashlib.md5(cache_key.encode()).hexdigest()

        cached_result = self._cache_get(cache_key)
        if cached_result is not None:
            cached_result["from_cache"] = True
            return cached_result

        try:
            self._execute_search(result, query, page, page_size, sort_by, ascending)
        except Exception as e:
            # API boundary: report the failure in the result instead of raising.
            logger.exception("搜索处理失败")
            result["status"] = "error"
            result["error"] = f"搜索处理失败: {str(e)}"
        else:
            # 10. Cache successful results only.
            if result["status"] == "success":
                self._cache_set(cache_key, result)

        # Elapsed time is reported on every non-cached path, including errors.
        result["time_ms"] = int((time.time() - start_time) * 1000)
        return result


# Usage example
if __name__ == "__main__":
    # Build the engine; add real proxy URLs to the pool if needed,
    # e.g. "http://127.0.0.1:7890".
    proxy_pool = []
    engine_options = {
        "redis_host": "localhost",
        "redis_port": 6379,
        "proxy_pool": proxy_pool,
        "cache_ttl": 300,  # cache results for 5 minutes
    }
    search_engine = VVICSearchEngine(**engine_options)

    try:
        # Demo 1: style + element + category + price range + in-stock scene,
        # ranked by bestseller score.
        query1 = "韩系 碎花 连衣裙 100-200元 现货"
        print(f"===== 搜索: {query1} =====")
        result1 = search_engine.search(
            query=query1,
            page=1,
            page_size=10,
            sort_by="bestseller",
        )

        if result1["status"] != "error":
            print(f"分词结果: {json.dumps(result1['word_segment'], ensure_ascii=False)}")
            print(f"总结果数: {result1['total']}, 过滤后: {result1['filtered_total']}")
            print("前3条结果:")
            for i, item in enumerate(result1["items"]):
                if i >= 3:
                    break
                print(f"{i+1}. {item['title']} | 价格: ¥{item['price']} | 爆款得分: {item['bestseller_score']} | 起订量: {item['min_order']}件")
        else:
            print(f"搜索失败: {result1['error']}")

        # Demo 2: style + scene words, cheapest first.
        query2 = "法式 泡泡袖 上衣 混批 小批量"
        print(f"\n===== 搜索: {query2} =====")
        result2 = search_engine.search(
            query=query2,
            page=1,
            page_size=10,
            sort_by="price",
            ascending=True,
        )

        if result2["status"] != "error":
            print(f"总结果数: {result2['total']}, 过滤后: {result2['filtered_total']}")
            print("前3条结果:")
            for i, item in enumerate(result2["items"]):
                if i >= 3:
                    break
                print(f"{i+1}. {item['title']} | 价格: ¥{item['price']} | 爆款得分: {item['bestseller_score']} | 混批支持: {item['support_mixed_batch']}")
        else:
            print(f"搜索失败: {result2['error']}")

    except Exception as e:
        # Top-level boundary of the demo script: report and exit normally.
        print(f"执行出错: {str(e)}")

四、核心技术模块解析

1. 快时尚专属分词系统

传统分词工具无法识别 "韩系慵懒风" 这类行业术语，本方案构建四级分词机制：

基础词拆分：提取核心商品名称（如 "连衣裙"）
专业词识别：通过内置词库匹配风格（韩系 / 法式）、元素（碎花 / 蕾丝）等专业术语
场景词解析：识别 "现货"、"混批" 等采购场景词，转化为过滤条件
数字条件提取：自动解析 "100-200 元"、"5 件起" 等量化需求

代码中_fashion_word_segment方法实现了这一逻辑，通过多轮正则匹配与词库比对，将模糊的自然语言查询转化为结构化的搜索条件。

2. 爆款因子加权排序算法

区别于普通电商的销量优先排序，设计五维评估模型：

plaintext

爆款得分 = 销量得分(30%) + 增长率得分(25%) + 收藏得分(20%) + 现货率得分(15%) + 供应商爆款率得分(10%)

销量得分：基于近 7 天销量计算，100 件得满分
增长率得分：突出上升趋势明显的潜力商品
收藏得分：基于收藏数计算，100 收藏得满分
现货率得分：快时尚采购对现货需求高，优先展示现货充足商品
供应商爆款率：优先推荐历史爆款率高的供应商商品

代码中_calculate_bestseller_score方法实现了这一计算逻辑，确保搜索结果既包含当前热销款，也挖掘有潜力的上升款。

3. 场景化过滤引擎

将 "现货"、"混批" 等抽象场景词转化为可执行的过滤规则：

现货过滤：仅保留现货率≥80% 的商品
混批过滤：筛选支持混款采购的商品
小批量过滤：限制最小起订量≤10 件
价格过滤：提取 "100-200 元" 中的区间条件

代码中_filter_by_scene和_filter_by_numeric方法实现了这一功能，使搜索结果更贴合服装批发的实际场景需求。

五、与传统搜索方案的差异对比

特性	传统搜索方案	本方案
分词方式	通用分词，无法识别行业术语	快时尚专属词库，支持风格 / 元素识别
排序逻辑	按销量 / 价格简单排序	多维度爆款因子加权排序
场景支持	不支持批发场景过滤	原生支持 "现货"、"混批" 等场景词
反爬策略	基础 UA 伪装	包含会话重置、参数随机化等高级策略
数据处理	仅返回原始结果	增加爆款得分、过滤后数量等增值数据