通过条形码获取对应商品信息python程序
承接上篇:通过条形码图片获取对应商品信息python程序
由于上篇采用图片来获取条形码,然后再通过条形码网站爬取信息,而这个过程中使用opencv识别图片中的条形码,效率和准确率都存在一定问题,因此后来经过查询和测试,想到了一个更便捷的获取条形码的方式:通过条形码扫描枪,先把商品条形码存放在excel中,然后再爬取商品信息。
当然你也可以边扫描条形码边爬取商品信息,但由于频繁访问网站会触发人工校验,导致需要多次重复扫描,比较烦人,所以后面采用的做法是:先把所有商品条形码一次性全部录入excel中,然后再爬取信息。
1、util脚本
# -*- coding: utf-8 -*-
import os
import sys
import traceback

import openpyxl
import requests
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment
from pbbs import *


def _print_sheet_overview(sheet_name, sheet_data):
    """Print the row/column counts and the first three rows of one sheet."""
    if not sheet_data:
        column_count = 0
    elif isinstance(sheet_data[0], list):
        column_count = len(sheet_data[0])
    else:
        # Single-column sheets store bare cell values (see the unwrapping in
        # read_all_xlsx_sheets), so calling len() on a row would raise.
        column_count = 1
    print(f"\n=== 工作表 '{sheet_name}' 的数据概况 ===")
    print(f"数据行数:{len(sheet_data)},数据列数:{column_count}")
    print("前3行数据:")
    for i, row in enumerate(sheet_data[:3]):
        print(f"第{i + 1}行:{row}")


def read_all_xlsx_sheets(file_path, is_print=False):
    """Read every worksheet of an XLSX file into plain Python lists.

    :param file_path: path of the XLSX file.
    :param is_print: when true, print a short overview of each worksheet.
    :return: dict mapping worksheet name -> list of rows. Each row is a list
        of cell values (``None`` for empty cells). Rows that are entirely
        empty are skipped. When a worksheet is only one column wide, each row
        is stored as the bare cell value instead of a one-element list.
    """
    # data_only=True returns cached formula results instead of formula text.
    wb = load_workbook(file_path, read_only=False, data_only=True)
    try:
        sheet_names = wb.sheetnames
        print(f"XLSX文件包含的工作表:{sheet_names}")

        all_sheet_data = {}
        for sheet_name in sheet_names:
            ws = wb[sheet_name]
            sheet_data = []
            # iter_rows() walks rows 1..max_row and columns 1..max_column.
            for row in ws.iter_rows(values_only=True):
                if all(cell is None for cell in row):
                    continue  # skip completely empty rows
                row_data = list(row)
                if len(row_data) == 1:
                    # Single-column sheet (e.g. a barcode list): unwrap.
                    row_data = row_data[0]
                sheet_data.append(row_data)

            all_sheet_data[sheet_name] = sheet_data
            if is_print:
                _print_sheet_overview(sheet_name, sheet_data)
    finally:
        # Release the workbook even when reading fails part-way.
        wb.close()
    return all_sheet_data


def _normalize_barcode(barcode):
    """Return *barcode* as the text that is placed in the lookup URL."""
    # Excel may hand a numeric cell back as a float such as 6921361202633.0;
    # formatting that directly would put a trailing ".0" into the URL.
    if isinstance(barcode, float) and barcode.is_integer():
        barcode = int(barcode)
    return str(barcode).strip()


def get_prod_infos_68api(barcode, timeout=15):
    """Scrape the product details for one barcode from 68api.com.

    :param barcode: barcode value (string or number) appended to the URL.
    :param timeout: seconds to wait for the HTTP response before giving up.
    :return: dict of product fields. Always contains '药名' (text of the
        ``<strong>`` inside ``div.bar-1-right``) and '公司名' (text of
        ``div.bar-2-2-2-1``); the remaining keys come from the entries of
        ``div.bar-1-list`` except those listed in ``skip_clss``.
    :raises requests.RequestException: on network errors, timeouts or a
        non-success HTTP status.
    :raises ValueError: when the page does not contain the expected elements
        (for instance when the site answers with a verification page).
    """
    # Field labels that are deliberately left out of the result.
    skip_clss = ('通用名称', '商品分类', '净含量', '上市时间', '参考价格',)

    # Example: https://68api.com/barcode/6921361202633
    url = f'https://68api.com/barcode/{_normalize_barcode(barcode)}'
    print(url)

    # Without a timeout a stalled connection would block the batch forever.
    html = requests.get(url, timeout=timeout)
    html.raise_for_status()
    html.encoding = 'utf-8'
    soup = BeautifulSoup(html.text, 'lxml')

    bar_1_right = soup.find('div', class_='bar-1-right')
    bar_1_list = soup.find('div', class_='bar-1-list')
    bar_2_2_2_1 = soup.find('div', class_='bar-2-2-2-1')
    strong = bar_1_right.find('strong') if bar_1_right is not None else None
    if strong is None or bar_1_list is None or bar_2_2_2_1 is None:
        raise ValueError(f'unexpected page layout for {url}')

    infos = {'药名': strong.text}
    for child in bar_1_list.children:
        # Each entry renders as "label\nvalue"; the value may be missing.
        texts = child.text.strip().split('\n')
        label = texts[0]
        if not label or label in skip_clss:
            continue
        infos[label] = texts[1] if len(texts) > 1 else ''

    infos['公司名'] = bar_2_2_2_1.text
    print(infos)
    return infos
2、工程主脚本
import time
import shutil,json
from util import *


def cpfile(file, cp_file_dst):
    """Copy the contents of *file* to *cp_file_dst* (overwriting it)."""
    shutil.copyfile(file, cp_file_dst)


def writejson(data, file):
    """Write *data* to *file* as indented UTF-8 JSON, keeping non-ASCII text."""
    with open(file, 'w', encoding='utf-8') as fp:
        json.dump(data, fp, ensure_ascii=False, indent=4)


def bar_code2prod_info_xls(fxls, limit=1, delay=30):
    """Look up the barcodes of every sheet in *fxls* and save the results.

    Two workbooks are written into a 'resinofs' directory created next to
    *fxls*: one with the product information per sheet and one with the
    barcodes whose lookup failed.

    :param fxls: path of the XLSX file holding the barcodes.
    :param limit: how many barcodes per sheet to process. The default of 1
        keeps the original behaviour; pass ``None`` to process all of them.
    :param delay: seconds to sleep after each lookup, to avoid triggering the
        site's manual verification.
    """
    prod_infos_dct = {}
    error_barcodes_dct = {}
    all_xlsx_sheet_dts = read_all_xlsx_sheets(fxls)
    sheet_names = list(all_xlsx_sheet_dts)
    print(sheet_names)

    for sheet_name in sheet_names:
        error_barcodes = []
        prod_infos = []
        for barcode in all_xlsx_sheet_dts[sheet_name][:limit]:
            print(sheet_name, barcode)
            try:
                prod_infos.append(get_prod_infos_68api(barcode))
            except Exception:
                # Not a bare except: Ctrl+C must still be able to stop a run.
                traceback.print_exc()
                error_barcodes.append(barcode)
            time.sleep(delay)
        # lst_dct2lst comes from pbbs; presumably it turns the list of dicts
        # into rows with a header row first -- TODO confirm.
        prod_infos_dct[sheet_name] = lst_dct2lst(prod_infos)
        error_barcodes_dct[sheet_name] = error_barcodes

    dtdir = pathsplit(fxls, 0)
    resdir = pathjoin(dtdir, 'resinofs', ismkdir=True)
    xlsxfile = a2bfilex(fxls, extname='.xlsx', bdir=resdir,
                        suffix='-prodinfos', mode=1)
    write_dict_to_xlsx(prod_infos_dct, xlsxfile)
    xlsxfile = a2bfilex(fxls, extname='.xlsx', bdir=resdir,
                        suffix='-request_error_barcodes', mode=1)
    write_dict_to_xlsx(error_barcodes_dct, xlsxfile)


def bar_code2prod_info_xls_by_cls(fxls, sheet_names, mresdir=None, resdir=None):
    """Look up the barcodes of the given sheets and save one result set each.

    For every sheet the product rows are written to ``<sheet>.xlsx`` and the
    failed barcodes to ``<sheet>-request_error.xlsx``. A result file that
    already exists is copied to a ``-bk`` file first; for the product file the
    existing rows are kept and the new rows appended after them.

    :param fxls: path of the XLSX file holding the barcodes.
    :param sheet_names: names of the sheets in *fxls* to process.
    :param mresdir: parent result directory; when given, each sheet is saved
        to ``<mresdir>/<sheet_name>`` and *resdir* is ignored.
    :param resdir: result directory used when *mresdir* is not given.
    """
    # NOTE(review): if both mresdir and resdir are None, resdir reaches
    # pathjoin() as None -- callers are expected to pass one of them.
    all_xlsx_sheet_dts = read_all_xlsx_sheets(fxls)
    print(sheet_names)

    for sheet_name in sheet_names:
        if isntnone(mresdir):
            resdir = pathjoin(mresdir, sheet_name, ismkdir=True)

        error_barcodes = []
        prod_infos = []
        barcodes = all_xlsx_sheet_dts[sheet_name][:]
        barcodenum = len(barcodes)
        for i, barcode in enumerate(barcodes):
            msg = f'{i + 1}/{barcodenum},{sheet_name},{barcode}'
            print(msg)
            try:
                prod_infos.append(get_prod_infos_68api(barcode))
            except Exception:
                # Not a bare except: Ctrl+C must still be able to stop a run.
                traceback.print_exc()
                error_barcodes.append(barcode)

        try:
            # Presumably rows with a header row first -- TODO confirm.
            tprod_infos = lst_dct2lst(prod_infos)
            resfile = pathjoin(resdir, f'{sheet_name}.xlsx')
            if pathexist(resfile):
                # Merge with the rows saved by an earlier run; [1:] drops the
                # first row of the new data so it is not repeated.
                exist_all_xlsx_sheet_dts = read_all_xlsx_sheets(resfile)
                exist_tprod_infos = exist_all_xlsx_sheet_dts[sheet_name]
                if tprod_infos:
                    exist_tprod_infos.extend(tprod_infos[1:])
                # NOTE(review): original indentation was lost; this is kept
                # outside the `if` so that an empty new result cannot replace
                # the existing rows.
                tprod_infos = exist_tprod_infos
                resfile_bk = pathjoin(resdir, f'{sheet_name}-bk.xlsx')
                cpfile(resfile, resfile_bk)
            writexls(tprod_infos, resfile, itms=None, sr=1, sc=1,
                     title=sheet_name)

            if error_barcodes:
                resfile = pathjoin(resdir, f'{sheet_name}-request_error.xlsx')
                if pathexist(resfile):
                    resfile_bk = pathjoin(
                        resdir, f'{sheet_name}-request_error-bk.xlsx')
                    cpfile(resfile, resfile_bk)
                writexls(error_barcodes, resfile, itms=None, sr=1, sc=1,
                         title=sheet_name)
            print(f'error_barcodes_num={len(error_barcodes)}')
        except Exception:
            printexc()
            # NOTE(review): original indentation was lost; the JSON dump is
            # reconstructed as the fallback for a failed XLSX save, so the
            # scraped data is not thrown away.
            if prod_infos:
                resfile = pathjoin(resdir, f'{sheet_name}.json')
                writejson(prod_infos, resfile)
            if error_barcodes:
                resfile = pathjoin(resdir, f'{sheet_name}-request_error.json')
                writejson(error_barcodes, resfile)


def bar_code2prod_info_xls_by_clsT():
    """Run the per-sheet lookup with the hard-coded paths and sheet name.

    With ``is_raw_proc`` set, the barcodes are read from the source workbook.
    Otherwise the sheet's ``-request_error.xlsx`` file from an earlier run is
    used as input, i.e. only the previously failed barcodes are retried.
    """
    cls = ['清热解毒', '消化系统', '抗过敏', '抗生素']
    print(f'clsnum={len(cls)}')

    is_raw_proc = 0  # 1: process the source workbook, 0: retry failed barcodes
    sheet_name = '左边柜子'
    mresdir = r'\data\resinofs'
    sheet_names = [sheet_name]

    if is_raw_proc:
        fxls = r'\data\药品条形码库-val.xlsx'
        bar_code2prod_info_xls_by_cls(fxls, sheet_names, mresdir=mresdir)
    else:
        # Single-sheet retry: input and output live in the same directory.
        resdir = pathjoin(mresdir, sheet_name)
        fxls = pathjoin(resdir, f'{sheet_name}-request_error.xlsx')
        bar_code2prod_info_xls_by_cls(fxls, sheet_names, resdir=resdir)


if __name__ == "__main__":
    bar_code2prod_info_xls_by_clsT()
