eyoucms 如何采集文章呢?基于php的采集样例
目标站:https://www.chunxia.tw/products.html
采集成果:https://lenao75.com/
需求:采集目标站点的所有旅游线路介绍
数据结构:采用eyoucms的自建模型(附加表)+文章表+管理员操作日志表+上传图片记录表
数据表:
-- Schema used by the scraping example (MySQL, MyISAM, utf8).
-- Column COMMENT strings are part of the schema and are kept verbatim.

-- Upload groups. The original dump also declared UNIQUE INDEX `id`(`id`),
-- which duplicates the PRIMARY KEY on the same column; it is dropped here.
CREATE TABLE `lenao75_com`.`ey_uploads_type` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `upload_type` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL DEFAULT '' COMMENT '名称',
  `add_time` int(10) NOT NULL DEFAULT 0 COMMENT '新增时间',
  `update_time` int(10) NOT NULL DEFAULT 0 COMMENT '更新时间',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = MyISAM AUTO_INCREMENT = 3 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '上传分组表' ROW_FORMAT = Dynamic;

-- One row per uploaded file.
CREATE TABLE `lenao75_com`.`ey_uploads` (
  `img_id` mediumint(8) UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '自增ID',
  `type_id` int(10) NOT NULL DEFAULT 0 COMMENT '分组ID',
  `aid` mediumint(8) UNSIGNED NOT NULL DEFAULT 0 COMMENT '文档ID',
  `title` varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT '文档标题',
  `image_url` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT '文件存储路径',
  `intro` varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT '图集描述',
  `width` int(11) NULL DEFAULT 0 COMMENT '图片宽度',
  `height` int(11) NULL DEFAULT 0 COMMENT '图片高度',
  `filesize` int(11) UNSIGNED NULL DEFAULT 0 COMMENT '文件大小',
  `mime` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT '图片类型',
  `users_id` int(11) NULL DEFAULT 0 COMMENT '用户ID',
  `sort_order` smallint(5) NULL DEFAULT 100 COMMENT '排序',
  `is_del` tinyint(1) NULL DEFAULT 0 COMMENT '1已删除 0未删除',
  `add_time` int(10) UNSIGNED NULL DEFAULT 0 COMMENT '上传时间',
  `update_time` int(11) NULL DEFAULT 0 COMMENT '更新时间',
  PRIMARY KEY (`img_id`) USING BTREE,
  INDEX `aid`(`aid`) USING BTREE,
  INDEX `add_time`(`add_time`) USING BTREE
) ENGINE = MyISAM AUTO_INCREMENT = 93 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '上传记录表' ROW_FORMAT = Dynamic;

-- Frequently used pictures.
CREATE TABLE `lenao75_com`.`ey_common_pic` (
  `id` int(11) UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '常用图片ID',
  `pic_path` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL DEFAULT '' COMMENT '图片地址',
  `lang` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT 'cn' COMMENT '多语言',
  `add_time` int(11) NOT NULL DEFAULT 0 COMMENT '新增时间',
  `update_time` int(11) NOT NULL DEFAULT 0 COMMENT '更新时间',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = MyISAM AUTO_INCREMENT = 103 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '常用图片' ROW_FORMAT = Dynamic;

-- Main document table; `channel` holds the model ID and `aid` is referenced
-- by the model-specific table below.
CREATE TABLE `lenao75_com`.`ey_archives` (
  `aid` int(10) NOT NULL AUTO_INCREMENT,
  `typeid` int(10) NOT NULL DEFAULT 0 COMMENT '当前栏目',
  `stypeid` varchar(90) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT '副栏目ID集合',
  `channel` int(10) NOT NULL DEFAULT 0 COMMENT '模型ID',
  `is_b` tinyint(1) NULL DEFAULT 0 COMMENT '加粗',
  `title` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT '文档标题',
  `subtitle` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT '副标题',
  `litpic` varchar(250) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT '封面图片',
  `is_head` tinyint(1) NULL DEFAULT 0 COMMENT '头条(0=否,1=是)',
  `is_special` tinyint(1) NULL DEFAULT 0 COMMENT '特荐(0=否,1=是)',
  `is_top` tinyint(1) NULL DEFAULT 0 COMMENT '置顶(0=否,1=是)',
  `is_recom` tinyint(1) NULL DEFAULT 0 COMMENT '推荐(0=否,1=是)',
  `is_jump` tinyint(1) NULL DEFAULT 0 COMMENT '跳转链接(0=否,1=是)',
  `is_litpic` tinyint(1) NULL DEFAULT 0 COMMENT '图片(0=否,1=是)',
  `is_roll` tinyint(1) UNSIGNED NOT NULL DEFAULT 0 COMMENT '滚动(0=否,1=是)',
  `is_slide` tinyint(1) UNSIGNED NOT NULL DEFAULT 0 COMMENT '幻灯(0=否,1=是)',
  `is_diyattr` tinyint(1) UNSIGNED NOT NULL DEFAULT 0 COMMENT '自定义(0=否,1=是)',
  `origin` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT '来源',
  `author` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT '作者',
  `click` int(10) NULL DEFAULT 0 COMMENT '点击数',
  `arcrank` int(10) NULL DEFAULT 0 COMMENT '阅读权限:0=开放浏览,-1=待审核稿件',
  `jumplinks` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT '跳转网址',
  `ismake` tinyint(1) NULL DEFAULT 0 COMMENT '是否静态页面(0=动态,1=静态)',
  `seo_title` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT 'SEO标题',
  `seo_keywords` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT 'SEO关键词',
  `seo_description` text CHARACTER SET utf8 COLLATE utf8_general_ci NULL COMMENT 'SEO描述',
  `attrlist_id` int(10) UNSIGNED NOT NULL DEFAULT 0 COMMENT '参数列表ID',
  `merchant_id` int(11) UNSIGNED NOT NULL DEFAULT 0 COMMENT '多商家ID',
  `free_shipping` tinyint(1) UNSIGNED NOT NULL DEFAULT 0 COMMENT '商品是否包邮(1包邮(免运费) 0跟随系统)',
  `users_price` decimal(10, 2) NOT NULL DEFAULT 0.00 COMMENT '会员价',
  `crossed_price` decimal(10, 2) UNSIGNED NOT NULL DEFAULT 0.00 COMMENT '商品划线价',
  `users_discount_type` tinyint(1) UNSIGNED NOT NULL DEFAULT 0 COMMENT '产品会员折扣类型(0:系统默认折扣; 1:指定会员级别; 2:不参与折扣;)',
  `users_free` tinyint(1) UNSIGNED NOT NULL DEFAULT 0 COMMENT '是否会员免费,默认0不免费,1为免费',
  `old_price` decimal(10, 2) NOT NULL DEFAULT 0.00 COMMENT '产品旧价',
  `sales_num` int(10) NOT NULL DEFAULT 0 COMMENT '总销售量',
  `virtual_sales` int(10) NULL DEFAULT 0 COMMENT '商品虚拟销售量',
  `sales_all` int(10) NULL DEFAULT 0 COMMENT '虚拟总销量',
  `stock_count` int(10) UNSIGNED NOT NULL DEFAULT 0 COMMENT '商品库存量',
  `stock_show` tinyint(1) UNSIGNED NOT NULL DEFAULT 1 COMMENT '商品库存在产品详情页是否显示,1为显示,0为不显示',
  `prom_type` tinyint(1) UNSIGNED NULL DEFAULT 0 COMMENT '产品类型:0=普通产品,1=虚拟(默认手动发货),2=虚拟(网盘),3=虚拟(自定义文本) 4-核销',
  `logistics_type` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '1' COMMENT '商品物流支持类型(1: 物流配送; 2: 到店核销)',
  `tempview` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT '文档模板',
  `status` tinyint(1) NULL DEFAULT 1 COMMENT '状态(0=屏蔽,1=正常)',
  `sort_order` int(10) NULL DEFAULT 0 COMMENT '排序号',
  `lang` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT 'cn' COMMENT '语言标识',
  `admin_id` int(10) NULL DEFAULT 0 COMMENT '管理员ID',
  `users_id` int(10) NULL DEFAULT 0 COMMENT '会员ID',
  `arc_level_id` int(10) NULL DEFAULT 0 COMMENT '文档会员权限ID',
  `restric_type` tinyint(1) NULL DEFAULT 0 COMMENT '限制模式,0=免费,1=付费,2=会员专享,3=会员付费',
  `is_del` tinyint(1) NULL DEFAULT 0 COMMENT '伪删除,1=是,0=否',
  `del_method` tinyint(1) NULL DEFAULT 0 COMMENT '伪删除状态,1为主动删除,2为跟随上级栏目被动删除',
  `joinaid` int(10) NULL DEFAULT 0 COMMENT '关联文档ID',
  `downcount` int(10) NULL DEFAULT 0 COMMENT '下载次数',
  `appraise` int(10) NULL DEFAULT 0 COMMENT '评价数',
  `collection` int(10) NULL DEFAULT 0 COMMENT '收藏数',
  `htmlfilename` varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT '自定义文件名',
  `province_id` int(10) NULL DEFAULT 0 COMMENT '省份',
  `city_id` int(10) NULL DEFAULT 0 COMMENT '所在城市',
  `area_id` int(10) NULL DEFAULT 0 COMMENT '所在区域',
  `add_time` int(11) NULL DEFAULT 0 COMMENT '新增时间',
  `update_time` int(11) NULL DEFAULT 0 COMMENT '更新时间',
  `no_vip_pay` tinyint(3) NULL DEFAULT 0 COMMENT 'restric_type = 2 时,会员专享,非会员可付费使用,0-关闭,1-开启',
  `editor_remote_img_local` tinyint(1) NULL DEFAULT 1 COMMENT '远程图片本地化',
  `editor_img_clear_link` tinyint(1) NULL DEFAULT 1 COMMENT '清除非本站链接',
  `reason` text CHARACTER SET utf8 COLLATE utf8_general_ci NULL COMMENT '退回原因',
  PRIMARY KEY (`aid`) USING BTREE,
  INDEX `add_time`(`add_time`) USING BTREE
) ENGINE = MyISAM AUTO_INCREMENT = 29 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '文档主表' ROW_FORMAT = Dynamic;

-- Custom-model table for travel products; `aid` points at ey_archives.aid.
-- `imgs` stores a PHP-serialized gallery array (see the sample INSERT).
CREATE TABLE `lenao75_com`.`ey_travel_content` (
  `id` int(10) NOT NULL AUTO_INCREMENT,
  `aid` int(10) NULL DEFAULT 0 COMMENT '文档ID',
  `add_time` int(11) NULL DEFAULT 0 COMMENT '新增时间',
  `update_time` int(11) NULL DEFAULT 0 COMMENT '更新时间',
  `imgs` text CHARACTER SET utf8 COLLATE utf8_general_ci NULL COMMENT '图集|10001',
  `info` longtext CHARACTER SET utf8 COLLATE utf8_general_ci NULL COMMENT '行程介紹',
  `price` varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL DEFAULT '价格' COMMENT '价格',
  `prince_info` longtext CHARACTER SET utf8 COLLATE utf8_general_ci NULL COMMENT '订价资料',
  `summary` longtext CHARACTER SET utf8 COLLATE utf8_general_ci NULL COMMENT '活动摘要',
  PRIMARY KEY (`id`) USING BTREE,
  INDEX `aid`(`aid`) USING BTREE
) ENGINE = MyISAM AUTO_INCREMENT = 3 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '附加表' ROW_FORMAT = Dynamic;

-- Administrator operation log.
CREATE TABLE `lenao75_com`.`ey_admin_log` (
  `log_id` bigint(16) UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '表id',
  `admin_id` int(10) NOT NULL DEFAULT -1 COMMENT '管理员id',
  `log_info` text CHARACTER SET utf8 COLLATE utf8_general_ci NULL COMMENT '日志描述',
  `log_ip` varchar(30) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT 'ip地址',
  `log_url` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '' COMMENT 'url',
  `log_time` int(11) NULL DEFAULT 0 COMMENT '日志时间',
  PRIMARY KEY (`log_id`) USING BTREE,
  INDEX `admin_id`(`admin_id`) USING BTREE
) ENGINE = MyISAM AUTO_INCREMENT = 370 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '管理员操作日志表' ROW_FORMAT = Dynamic;
数据入库sql:
-- Upload group for the gallery images.
INSERT INTO `ey_uploads_type` (`upload_type` , `add_time` , `update_time`) VALUES ('222' , 1762567311 , 1762567311);

-- Eight gallery images stored in upload group 2 (aid is 0 at upload time).
INSERT INTO `ey_uploads` (`aid` , `type_id` , `image_url` , `title` , `intro` , `width` , `height` , `filesize` , `mime` , `users_id` , `sort_order` , `add_time` , `update_time`) VALUES (0 , 2 , '/uploads/allimg/20251108/2-25110Q00202642.png' , '企业微信截图_20251108095934.png' , '' , 997 , 663 , 1031788 , 'image/png' , 1 , 100 , 1762567322 , 1762567322);
INSERT INTO `ey_uploads` (`aid` , `type_id` , `image_url` , `title` , `intro` , `width` , `height` , `filesize` , `mime` , `users_id` , `sort_order` , `add_time` , `update_time`) VALUES (0 , 2 , '/uploads/allimg/20251108/2-25110Q002022c.png' , '企业微信截图_20251108100007.png' , '' , 998 , 666 , 1099904 , 'image/png' , 1 , 100 , 1762567322 , 1762567322);
INSERT INTO `ey_uploads` (`aid` , `type_id` , `image_url` , `title` , `intro` , `width` , `height` , `filesize` , `mime` , `users_id` , `sort_order` , `add_time` , `update_time`) VALUES (0 , 2 , '/uploads/allimg/20251108/2-25110Q002022M.png' , '企业微信截图_20251108100022.png' , '' , 998 , 665 , 1709756 , 'image/png' , 1 , 100 , 1762567322 , 1762567322);
INSERT INTO `ey_uploads` (`aid` , `type_id` , `image_url` , `title` , `intro` , `width` , `height` , `filesize` , `mime` , `users_id` , `sort_order` , `add_time` , `update_time`) VALUES (0 , 2 , '/uploads/allimg/20251108/2-25110Q00202418.png' , '企业微信截图_20251108100035.png' , '' , 997 , 663 , 1543067 , 'image/png' , 1 , 100 , 1762567322 , 1762567322);
INSERT INTO `ey_uploads` (`aid` , `type_id` , `image_url` , `title` , `intro` , `width` , `height` , `filesize` , `mime` , `users_id` , `sort_order` , `add_time` , `update_time`) VALUES (0 , 2 , '/uploads/allimg/20251108/2-25110Q002025c.png' , '企业微信截图_20251108100046.png' , '' , 993 , 663 , 1213989 , 'image/png' , 1 , 100 , 1762567322 , 1762567322);
INSERT INTO `ey_uploads` (`aid` , `type_id` , `image_url` , `title` , `intro` , `width` , `height` , `filesize` , `mime` , `users_id` , `sort_order` , `add_time` , `update_time`) VALUES (0 , 2 , '/uploads/allimg/20251108/2-25110Q00202G4.png' , '企业微信截图_20251108100055.png' , '' , 1001 , 667 , 1339834 , 'image/png' , 1 , 100 , 1762567322 , 1762567322);
INSERT INTO `ey_uploads` (`aid` , `type_id` , `image_url` , `title` , `intro` , `width` , `height` , `filesize` , `mime` , `users_id` , `sort_order` , `add_time` , `update_time`) VALUES (0 , 2 , '/uploads/allimg/20251108/2-25110Q00202913.png' , '企业微信截图_20251108100105.png' , '' , 998 , 665 , 1352028 , 'image/png' , 1 , 100 , 1762567322 , 1762567322);
INSERT INTO `ey_uploads` (`aid` , `type_id` , `image_url` , `title` , `intro` , `width` , `height` , `filesize` , `mime` , `users_id` , `sort_order` , `add_time` , `update_time`) VALUES (0 , 2 , '/uploads/allimg/20251108/2-25110Q00203955.png' , '企业微信截图_20251108100114.png' , '' , 999 , 664 , 1692118 , 'image/png' , 1 , 100 , 1762567323 , 1762567323);

-- The same eight images registered as common pictures (one multi-row insert).
-- In the original paste this statement was fused onto the previous INSERT.
INSERT INTO `ey_common_pic` (`pic_path` , `lang` , `add_time` , `update_time`) VALUES
  ( '/uploads/allimg/20251108/2-25110Q00203955.png','cn',1762567346,1762567346 ) ,
  ( '/uploads/allimg/20251108/2-25110Q00202913.png','cn',1762567346,1762567346 ) ,
  ( '/uploads/allimg/20251108/2-25110Q00202G4.png','cn',1762567346,1762567346 ) ,
  ( '/uploads/allimg/20251108/2-25110Q002025c.png','cn',1762567346,1762567346 ) ,
  ( '/uploads/allimg/20251108/2-25110Q00202418.png','cn',1762567346,1762567346 ) ,
  ( '/uploads/allimg/20251108/2-25110Q00202642.png','cn',1762567346,1762567346 ) ,
  ( '/uploads/allimg/20251108/2-25110Q002022c.png','cn',1762567346,1762567346 ) ,
  ( '/uploads/allimg/20251108/2-25110Q002022M.png','cn',1762567346,1762567346 );

-- Cover image: upload record plus common-picture record.
INSERT INTO `ey_uploads` (`aid` , `type_id` , `image_url` , `title` , `intro` , `width` , `height` , `filesize` , `mime` , `users_id` , `sort_order` , `add_time` , `update_time`) VALUES (0 , 2 , '/uploads/allimg/20251108/2-25110Q0031b03.png' , 'aaaaaaaaaaaaaaaa_20251108100300.png' , '' , 150 , 150 , 43340 , 'image/png' , 1 , 100 , 1762567399 , 1762567399);
INSERT INTO `ey_common_pic` (`pic_path` , `lang` , `add_time` , `update_time`) VALUES ( '/uploads/allimg/20251108/2-25110Q0031b03.png','cn',1762567401,1762567401 );
-- Main document row (channel 101, column 10) using the cover image as litpic.
INSERT INTO `ey_archives` (`title` , `subtitle` , `typeid` , `is_litpic` , `jumplinks` , `province_id` , `city_id` , `area_id` , `seo_title` , `seo_keywords` , `seo_description` , `author` , `origin` , `click` , `arcrank` , `add_time` , `htmlfilename` , `channel` , `litpic` , `is_b` , `is_head` , `is_special` , `is_recom` , `is_roll` , `is_slide` , `is_diyattr` , `editor_remote_img_local` , `editor_img_clear_link` , `is_jump` , `admin_id` , `lang` , `sort_order` , `crossed_price` , `users_price` , `old_price` , `update_time`) VALUES ('銀海-浪漫之旅' , '' , '10' , 1 , '' , '0' , 0 , 0 , '銀海-浪漫之旅' , '銀海-浪漫之旅' , '銀海-浪漫之旅' , '小编' , '网络' , '591' , '0' , 1762567107 , '' , 101 , '/uploads/allimg/20251108/2-25110Q0031b03.png' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 2 , 'cn' , 100 , 0 , 0 , 0 , 1762567107);

-- Model-specific row for document 28; `imgs` is a PHP-serialized gallery array.
INSERT INTO `ey_travel_content` (`imgs` , `summary` , `price` , `prince_info` , `info` , `aid` , `add_time` , `update_time`) VALUES ('a:8:{i:0;a:2:{s:9:\"image_url\";s:45:\"/uploads/allimg/20251108/2-25110Q00203955.png\";s:5:\"intro\";s:0:\"\";}i:1;a:2:{s:9:\"image_url\";s:45:\"/uploads/allimg/20251108/2-25110Q00202913.png\";s:5:\"intro\";s:0:\"\";}i:2;a:2:{s:9:\"image_url\";s:44:\"/uploads/allimg/20251108/2-25110Q00202G4.png\";s:5:\"intro\";s:0:\"\";}i:3;a:2:{s:9:\"image_url\";s:44:\"/uploads/allimg/20251108/2-25110Q002025c.png\";s:5:\"intro\";s:0:\"\";}i:4;a:2:{s:9:\"image_url\";s:45:\"/uploads/allimg/20251108/2-25110Q00202418.png\";s:5:\"intro\";s:0:\"\";}i:5;a:2:{s:9:\"image_url\";s:45:\"/uploads/allimg/20251108/2-25110Q00202642.png\";s:5:\"intro\";s:0:\"\";}i:6;a:2:{s:9:\"image_url\";s:44:\"/uploads/allimg/20251108/2-25110Q002022c.png\";s:5:\"intro\";s:0:\"\";}i:7;a:2:{s:9:\"image_url\";s:44:\"/uploads/allimg/20251108/2-25110Q002022M.png\";s:5:\"intro\";s:0:\"\";}}' , '<ul class="fa-ul">\r\n \r\n <li class="mb-3">約4月1日-10中旬營業\r\n</li>\r\n \r\n \r\n <li class="mb-3">岐頭遊客中心報到\r\n</li>\r\n \r\n \r\n <li class="mb-3">行程約6~7小時\r\n</li>\r\n \r\n \r\n \r\n <li class="mb-3">開航時間約\r\n \r\n \r\n <br> AM 08:00-10:00\r\n \r\n \r\n \r\n <br>回航時間約\r\n\r\n <br>PM15:00-17:00\r\n\r\n \r\n </li>\r\n \r\n \r\n <li class="mb-3">開航前20分鐘辦理報到手續\r\n</li>\r\n \r\n \r\n <li class="mb-3">前一天專人會透過line通知具体開船時間(視潮汐及船班調度調整)</li>\r\n \r\n </ul>' , '1100' , '<ul class="fa-ul list-paddingleft-2"><li>成人(3歲以上同成人)NT$1,100</li><li>未滿3歲NT$100</li></ul>' , '<ul class=" list-paddingleft-2"><li>歧頭遊艇碼頭搭船出航~讓專業的為您導覽解說…</li><li>東海巡航:員貝後山三大景點介紹、澎澎灘會移動的小島、雞善嶼、錠鉤嶼</li><li>觀賞燕鷗:觀察燕鷗覓食俯衝入海</li><li>鳥嶼午餐:登島鳥嶼含午餐(海鮮粥+3道小菜)</li><li>澎澎灘水上活動: 1.透明獨木舟 2.SUP立獎 3.超大型水上溜滑梯 4.高空跳水 5.泰山式搖擺跳水 6.香蕉船 7.海戰車8.水上摩托車</li></ul>' , '28' , 1762569962 , 1762569962);

-- Administrator log entry recording the new document.
INSERT INTO `ey_admin_log` (`log_time` , `admin_id` , `log_info` , `log_ip` , `log_url`) VALUES (1762569962 , 2 , '新增数据:銀海-浪漫之旅' , '0.0.0.0' , '/login-in.php');
实现代码(基于thinkphp5):
<?php
namespace app\home\controller;
use think\Controller;
use QL\QueryList;
use think\Db;
use think\Log;// 设置更长的执行时间限制
ini_set('default_socket_timeout', 180);
set_time_limit(300); // Raise the script time limit to 5 minutes.
ini_set('max_execution_time', 300);

/**
 * Scrapes travel-product pages and stores them in the eyoucms tables
 * `archives` (main document) and `travel_content` (custom model data).
 *
 * Entry points are spiderChunxiaProduct() for a single page and testSpider()
 * for a batch of URLs listed in products.txt.
 */
class ProductSpider extends Controller
{
    /** User-Agent header sent with every outgoing HTTP request. */
    const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';

    /** @var string[] File extensions a downloaded image is allowed to keep. */
    private static $allowedImageExtensions = ['jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp'];

    /** @var string URL of the page currently being scraped (title fallback). */
    private $sourceUrl = '';

    /** @var string|null HTML the cached DOMXPath was built from. */
    private $cachedHtml = null;

    /** @var \DOMXPath|null XPath evaluator for $cachedHtml. */
    private $cachedXpath = null;

    /**
     * Constructor: silences libxml parse errors and hides deprecation/notice
     * level messages for the whole request.
     */
    public function __construct()
    {
        parent::__construct();

        // Real-world HTML5 markup triggers libxml warnings; collect them
        // internally instead of emitting them.
        libxml_use_internal_errors(true);

        // Report everything except E_DEPRECATED and E_NOTICE.
        error_reporting(E_ALL & ~E_DEPRECATED & ~E_NOTICE);
    }

    /**
     * Build the stream context shared by page and image downloads.
     *
     * @return resource
     */
    private function createHttpContext()
    {
        return stream_context_create([
            'http' => [
                'timeout' => 30,
                'header'  => "User-Agent: " . self::USER_AGENT,
            ],
        ]);
    }

    /**
     * Fetch the HTML of a page.
     *
     * @param string $url
     * @return string|false Page body, or false when the URL is not http(s)
     *                      or the request fails.
     */
    private function getHtmlContent($url)
    {
        // The URL can be supplied by the caller of the public action, and
        // file_get_contents() would also read local paths or other stream
        // wrappers, so only plain web URLs are allowed through.
        $scheme = strtolower((string) parse_url((string) $url, PHP_URL_SCHEME));
        if ($scheme !== 'http' && $scheme !== 'https') {
            return false;
        }

        return @file_get_contents($url, false, $this->createHttpContext());
    }

    /**
     * Scrape one product page.
     *
     * @param string $url Product URL
     * @return array Scraped fields on success; on failure an array with the
     *               keys `error` and `source_url`.
     */
    public function spiderChunxiaProduct($url = 'https://www.chunxia.tw/products/dh-1.html')
    {
        try {
            // Remembered so extractPageTitle() can fall back to the URL.
            $this->sourceUrl = (string) $url;

            $html = $this->getHtmlContent($url);
            if (!$html) {
                throw new \Exception('无法获取页面内容');
            }

            // QueryList only wraps the HTML here; extraction is done via XPath.
            $ql = QueryList::Query($html, [], '', 'UTF-8', 'UTF-8', false);

            return [
                'page_title'       => $this->extractPageTitle($ql),
                'activity_summary' => $this->extractActivitySummary($ql),
                'price_data'       => $this->extractPriceData($ql),
                'itinerary_info'   => $this->extractItineraryInfo($ql),
                'price'            => $this->extractPrice($ql),
                'images'           => $this->extractAndDownloadImages($ql, $url),
                'source_url'       => $url,
                'crawl_time'       => date('Y-m-d H:i:s'),
            ];
        } catch (\Exception $e) {
            return [
                'error'      => '采集失败: ' . $e->getMessage(),
                'source_url' => $url,
            ];
        }
    }

    /**
     * Return a DOMXPath over the page held by $ql.
     *
     * The parsed document is cached by HTML content, so the extract*()
     * methods called for the same page share one parse. The cache is keyed on
     * the HTML string rather than on the $ql object, because the same object
     * may be handed back for different pages.
     *
     * @param \QL\QueryList $ql
     * @return \DOMXPath
     */
    private function createXPath($ql)
    {
        $html = (string) $ql->getHtml();
        if ($this->cachedXpath !== null && $this->cachedHtml === $html) {
            return $this->cachedXpath;
        }

        $dom = new \DOMDocument();
        libxml_use_internal_errors(true);
        // Wrap the markup in a UTF-8 skeleton before parsing.
        @$dom->loadHTML('<!DOCTYPE html><html><head><meta charset="UTF-8"></head><body>' . $html . '</body></html>');
        libxml_clear_errors();

        $this->cachedHtml  = $html;
        $this->cachedXpath = new \DOMXPath($dom);

        return $this->cachedXpath;
    }

    /**
     * Serialize a DOM node back to HTML.
     *
     * @param \DOMNode $node
     * @return string
     */
    private function nodeHtml($node)
    {
        return (string) $node->ownerDocument->saveHTML($node);
    }

    /**
     * Extract the page title from the article header.
     *
     * @param \QL\QueryList $ql
     * @return string Title text, or a name derived from the source URL when
     *                the heading is missing.
     */
    private function extractPageTitle($ql)
    {
        $titleNodes = $this->createXPath($ql)->query('/html/body/main/div/div/div/article/header/h1');
        if ($titleNodes->length > 0) {
            return trim($titleNodes->item(0)->textContent);
        }

        // No heading found: derive the title from the URL being scraped.
        return $this->extractTitleFromUrl($this->sourceUrl);
    }

    /**
     * Extract a <ul> and its <li> children.
     *
     * @param \QL\QueryList $ql
     * @param string        $ulXPath Absolute XPath of the <ul> element
     * @return array Keys: `ul_html`, `li_items` (list of text/html pairs),
     *               `item_count`.
     */
    private function extractListSection($ql, $ulXPath)
    {
        $xpath = $this->createXPath($ql);

        $ulNodes = $xpath->query($ulXPath);
        $ulHtml  = $ulNodes->length > 0 ? $this->nodeHtml($ulNodes->item(0)) : '';

        $items = [];
        foreach ($xpath->query($ulXPath . '/li') as $liNode) {
            $items[] = [
                'text' => trim($liNode->textContent),
                'html' => trim($this->nodeHtml($liNode)),
            ];
        }

        return [
            'ul_html'    => $ulHtml,
            'li_items'   => $items,
            'item_count' => count($items),
        ];
    }

    /**
     * Extract the activity summary (first list of the article body).
     *
     * @param \QL\QueryList $ql
     * @return array See extractListSection().
     */
    private function extractActivitySummary($ql)
    {
        return $this->extractListSection($ql, '/html/body/main/div/div/div/article/div[2]/ul[1]');
    }

    /**
     * Extract the pricing details (second list of the article body).
     *
     * @param \QL\QueryList $ql
     * @return array See extractListSection().
     */
    private function extractPriceData($ql)
    {
        return $this->extractListSection($ql, '/html/body/main/div/div/div/article/div[2]/ul[2]');
    }

    /**
     * Extract the itinerary description (third list of the article body).
     *
     * @param \QL\QueryList $ql
     * @return array Keys: `text` (plain text) and `html`; both empty strings
     *               when the list is missing.
     */
    private function extractItineraryInfo($ql)
    {
        $nodes = $this->createXPath($ql)->query('/html/body/main/div/div/div/article/div[2]/ul[3]');
        if ($nodes->length === 0) {
            return ['text' => '', 'html' => ''];
        }

        $node = $nodes->item(0);

        return [
            'text' => trim($node->textContent),
            'html' => $this->nodeHtml($node),
        ];
    }

    /**
     * Extract the displayed price.
     *
     * @param \QL\QueryList $ql
     * @return string Price text, or '价格待定' when the price node is missing.
     */
    private function extractPrice($ql)
    {
        $priceNodes = $this->createXPath($ql)->query('/html/body/main/div/div/div/article/div[1]/div[3]/span[1]');
        if ($priceNodes->length > 0) {
            return trim($priceNodes->item(0)->textContent);
        }

        return '价格待定';
    }

    /**
     * Turn an <img src> value into an absolute URL.
     *
     * @param string $src       Raw src attribute
     * @param string $sourceUrl URL of the page the image was found on
     * @return string
     */
    private function resolveImageUrl($src, $sourceUrl)
    {
        if (strpos($src, 'http') === 0) {
            return $src;
        }

        $scheme = parse_url($sourceUrl, PHP_URL_SCHEME);

        // Protocol-relative URL ("//cdn.example.com/a.png"): only the scheme
        // is missing.
        if (strpos($src, '//') === 0) {
            return $scheme . ':' . $src;
        }

        // Other relative paths are resolved against the site root.
        $baseUrl = $scheme . '://' . parse_url($sourceUrl, PHP_URL_HOST);

        return $baseUrl . '/' . ltrim($src, '/');
    }

    /**
     * Find the carousel images on the page and download them.
     *
     * @param \QL\QueryList $ql
     * @param string        $sourceUrl
     * @return array Always contains `total_count` and `downloaded_images`;
     *               `target_dir` is added when the carousel was found and
     *               `error` when extraction failed.
     */
    private function extractAndDownloadImages($ql, $sourceUrl)
    {
        // Default result, returned as-is when the page has no carousel.
        $images = [
            'total_count'       => 0,
            'downloaded_images' => [],
        ];

        try {
            $xpath = $this->createXPath($ql);

            // NOTE(review): the id is spelled "carouse1" (digit one) — confirm
            // this matches the target page and is not a typo for "carousel".
            $carouselContainer = $xpath->query('//*[@id="carouse1"]/div[2]');
            if ($carouselContainer->length === 0) {
                return $images;
            }

            // <root>/uploads/allimg/<Ymd>; ROOT_PATH may or may not end with a
            // separator, so it is normalized first.
            $targetDir = rtrim(ROOT_PATH, '/\\') . DS . 'uploads' . DS . 'allimg' . DS . date('Ymd');
            if (!is_dir($targetDir) && !mkdir($targetDir, 0755, true) && !is_dir($targetDir)) {
                throw new \Exception('无法创建上传目录: ' . $targetDir);
            }

            $downloadedImages = [];
            $imgNodes = $xpath->query('.//img', $carouselContainer->item(0));
            foreach ($imgNodes as $index => $imgNode) {
                $src = $imgNode->getAttribute('src');

                // Skip empty links and inline data URIs.
                if (empty($src) || strpos($src, 'data:') === 0) {
                    continue;
                }

                $downloadedImage = $this->downloadImage(
                    $this->resolveImageUrl($src, $sourceUrl),
                    $targetDir,
                    $index + 1
                );
                if ($downloadedImage) {
                    $downloadedImages[] = $downloadedImage;
                }
            }

            $images = [
                'total_count'       => count($downloadedImages),
                'downloaded_images' => $downloadedImages,
                'target_dir'        => $targetDir,
            ];
        } catch (\Exception $e) {
            \think\Log::error('图片提取失败: ' . $e->getMessage());
            $images = [
                'error'             => '图片提取失败: ' . $e->getMessage(),
                'total_count'       => 0,
                'downloaded_images' => [],
            ];
        }

        return $images;
    }

    /**
     * Download a single image into $targetDir.
     *
     * @param string $imageUrl
     * @param string $targetDir
     * @param int    $index 1-based position of the image, used in the file name
     * @return array|null Download details, or null when the download failed
     */
    private function downloadImage($imageUrl, $targetDir, $index)
    {
        try {
            // Take the extension from the URL path only (a query string must
            // not leak into the file name) and accept known image types only:
            // the file lands in a web-accessible directory, so a
            // remote-controlled extension such as ".php" must never be kept.
            $urlPath   = (string) parse_url($imageUrl, PHP_URL_PATH);
            $extension = strtolower((string) pathinfo($urlPath, PATHINFO_EXTENSION));
            if (!in_array($extension, self::$allowedImageExtensions, true)) {
                $extension = 'jpg';
            }

            $fileName = 'image_' . $index . '_' . time() . '_' . mt_rand(1000, 9999) . '.' . $extension;
            $filePath = $targetDir . '/' . $fileName;

            $imageData = @file_get_contents($imageUrl, false, $this->createHttpContext());

            // An empty body makes file_put_contents() return 0, so it is
            // treated as a failed download as well.
            if ($imageData !== false && file_put_contents($filePath, $imageData)) {
                return [
                    'original_url'  => $imageUrl,
                    'saved_path'    => $filePath,
                    'file_name'     => $fileName,
                    'file_size'     => filesize($filePath),
                    'download_time' => date('Y-m-d H:i:s'),
                ];
            }
        } catch (\Exception $e) {
            \think\Log::error('下载图片失败: ' . $e->getMessage() . ' URL: ' . $imageUrl);
        }

        return null;
    }

    /**
     * Batch run: read URLs from products.txt (one per line) and scrape each.
     *
     * @return mixed JSON response with a summary and the per-URL results
     */
    public function testSpider()
    {
        $productsFile = ROOT_PATH . 'products.txt';
        if (!file_exists($productsFile)) {
            return json(['error' => 'products.txt文件不存在']);
        }

        $fileContent = file_get_contents($productsFile);
        if ($fileContent === false) {
            return json(['error' => '无法读取products.txt文件']);
        }

        // Split on any line ending (the file may come from another OS), drop
        // blank lines, and re-index so positions are contiguous.
        $urls = array_values(array_filter(array_map('trim', preg_split('/\R/', $fileContent))));
        if (empty($urls)) {
            return json(['error' => 'products.txt文件中没有有效的URL']);
        }

        $total        = count($urls);
        $results      = [];
        $successCount = 0;
        $errorCount   = 0;

        foreach ($urls as $index => $url) {
            $progress = "正在采集第 " . ($index + 1) . " 个链接,共 " . $total . " 个: " . $url;
            \think\Log::info($progress);

            // Pause before each request.
            \think\Log::info("等待页面加载10秒...");
            sleep(10);

            $result = $this->spiderChunxiaProduct($url);

            if (isset($result['error'])) {
                $errorCount++;
                \think\Log::error("采集失败: " . $url . " - " . $result['error']);
            } elseif ($this->saveToDatabase($result)) {
                $successCount++;
                \think\Log::info("成功采集并保存: " . $url);
            } else {
                // Scraped, but not stored: count as a failure, not a success.
                $errorCount++;
                \think\Log::error("保存失败: " . $url);
            }

            $results[] = [
                'url'    => $url,
                'result' => $result,
            ];

            // Short extra delay between pages to avoid hammering the site.
            if ($index < $total - 1) {
                sleep(2);
            }
        }

        return json([
            'summary' => [
                'total_urls'    => $total,
                'success_count' => $successCount,
                'error_count'   => $errorCount,
            ],
            'results' => $results,
        ]);
    }

    /**
     * Store one scraped product: update it when a document with the same
     * title exists, otherwise create it.
     *
     * @param array $data Result of spiderChunxiaProduct()
     * @return bool False when a database error occurred
     */
    private function saveToDatabase($data)
    {
        try {
            $title = $this->resolveTitle($data);
            Log::info("title:" . $title);

            // Exact title match: a substring (LIKE) match could select an
            // unrelated document and overwrite its content.
            $archive = Db::name('archives')->where('title', $title)->find();
            Log::info("archive:" . json_encode($archive));

            $currentTime = time();

            if ($archive) {
                $existing = Db::name('travel_content')->where('aid', $archive['aid'])->find();
                Log::record(json_encode($existing));

                if ($existing) {
                    Log::record(json_encode($data));
                    $this->updateExistingRecord($existing, $data, $currentTime);
                } else {
                    Log::record("插入记录");
                    $this->insertTravelContentRecord($archive['aid'], $data, $currentTime);
                }
            } else {
                // No document yet: create both the archives and the
                // travel_content rows.
                $this->insertNewRecord($data, $currentTime);
            }

            return true;
        } catch (\Exception $e) {
            \think\Log::error('保存采集数据失败: ' . $e->getMessage());
            return false;
        }
    }

    /**
     * Pick the document title: the scraped page title when present,
     * otherwise a name derived from the source URL.
     *
     * @param array $data
     * @return string
     */
    private function resolveTitle($data)
    {
        if (!empty($data['page_title'])) {
            return $data['page_title'];
        }

        return $this->extractTitleFromUrl($data['source_url']);
    }

    /**
     * Derive a title from the last path segment of a URL.
     *
     * @param string $url
     * @return string Mapped product name when known, else the file name
     *                without its extension
     */
    private function extractTitleFromUrl($url)
    {
        $urlParts = explode('/', $url);
        $lastPart = end($urlParts);
        $fileName = pathinfo($lastPart, PATHINFO_FILENAME);

        // Known page slugs and their product names; extend as needed.
        $titleMap = [
            'dh-1' => '银海-浪漫之旅',
        ];

        return isset($titleMap[$fileName]) ? $titleMap[$fileName] : $fileName;
    }

    /**
     * Build the scraped column values shared by every travel_content write.
     *
     * @param array $data
     * @param int   $currentTime
     * @return array
     */
    private function buildTravelContentData($data, $currentTime)
    {
        return [
            'summary'     => $this->formatActivitySummary($data['activity_summary']),
            // The column is NOT NULL, so an empty price falls back to a label.
            'price'       => $data['price'] ?: '价格',
            'prince_info' => $this->formatPriceData($data['price_data']),
            'info'        => $this->formatItineraryInfo($data['itinerary_info']),
            'imgs'        => $this->formatImagesData($data['images']),
            'update_time' => $currentTime,
        ];
    }

    /**
     * Update an existing travel_content row.
     *
     * @param array $existing Current travel_content row
     * @param array $data
     * @param int   $currentTime
     */
    private function updateExistingRecord($existing, $data, $currentTime)
    {
        $travelContentData = $this->buildTravelContentData($data, $currentTime);

        // Keep the stored gallery when this crawl downloaded no images, so a
        // failed download does not wipe existing data.
        if ($travelContentData['imgs'] === '') {
            unset($travelContentData['imgs']);
        }

        Db::name('travel_content')->where('aid', $existing['aid'])->update($travelContentData);

        $this->logAdminAction($existing['aid'], '更新数据', '从春霞旅游网站更新产品数据');
    }

    /**
     * Insert a travel_content row for a document that already exists.
     *
     * @param int   $aid
     * @param array $data
     * @param int   $currentTime
     */
    private function insertTravelContentRecord($aid, $data, $currentTime)
    {
        $travelContentData = $this->buildTravelContentData($data, $currentTime);
        $travelContentData['aid']      = $aid;
        $travelContentData['add_time'] = $currentTime;

        Db::name('travel_content')->insert($travelContentData);

        $this->logAdminAction($aid, '新增travel_content数据', '从春霞旅游网站采集产品数据');
    }

    /**
     * Insert a new document: one archives row plus its travel_content row.
     *
     * @param array $data
     * @param int   $currentTime
     */
    private function insertNewRecord($data, $currentTime)
    {
        $title = $this->resolveTitle($data);

        // The first downloaded image becomes the cover picture.
        $litpicPath = $this->getFirstImagePath($data['images']);

        $archiveData = [
            'title'           => $title,
            'typeid'          => 17,  // Target column ID (site-specific).
            'channel'         => 101, // Travel product model.
            'is_litpic'       => 1,
            'litpic'          => $litpicPath,
            'seo_title'       => $title,
            'seo_keywords'    => $title,
            'seo_description' => $title,
            'author'          => '小编',
            'origin'          => '网络',
            'click'           => 0,
            'arcrank'         => 0,
            'status'          => 1,
            'sort_order'      => 100,
            'lang'            => 'cn',
            'admin_id'        => 2,
            'crossed_price'   => 0,
            'users_price'     => 0,
            'old_price'       => 0,
            'add_time'        => $currentTime,
            'update_time'     => $currentTime,
        ];
        $aid = Db::name('archives')->insertGetId($archiveData);

        $travelContentData = $this->buildTravelContentData($data, $currentTime);
        $travelContentData['aid']      = $aid;
        $travelContentData['add_time'] = $currentTime;
        Db::name('travel_content')->insert($travelContentData);

        $this->logAdminAction($aid, '新增数据', '从春霞旅游网站采集产品数据');
    }

    /**
     * Join the HTML of every list item, HTML-escaped, one item per line.
     *
     * @param array $section Result of extractListSection()
     * @return string Empty string when the section has no items
     */
    private function formatListItems($section)
    {
        if (empty($section['li_items'])) {
            return '';
        }

        $formatted = '';
        foreach ($section['li_items'] as $item) {
            $formatted .= htmlspecialchars($item['html'], ENT_QUOTES, 'UTF-8') . "\n";
        }

        return $formatted;
    }

    /**
     * Format the activity summary for storage.
     *
     * @param array $activitySummary
     * @return string
     */
    private function formatActivitySummary($activitySummary)
    {
        return $this->formatListItems($activitySummary);
    }

    /**
     * Format the pricing details for storage.
     *
     * @param array $priceData
     * @return string
     */
    private function formatPriceData($priceData)
    {
        return $this->formatListItems($priceData);
    }

    /**
     * Format the itinerary description for storage (HTML-escaped).
     *
     * @param array $itineraryInfo
     * @return string
     */
    private function formatItineraryInfo($itineraryInfo)
    {
        return htmlspecialchars($itineraryInfo['html'] ?? '', ENT_QUOTES, 'UTF-8');
    }

    /**
     * Serialize the downloaded images for the `imgs` column.
     *
     * @param array $imagesData Result of extractAndDownloadImages()
     * @return string PHP-serialized list of image_url/intro pairs, or an
     *                empty string when nothing was downloaded
     */
    private function formatImagesData($imagesData)
    {
        if (empty($imagesData['downloaded_images'])) {
            return '';
        }

        $formattedImages = [];
        foreach ($imagesData['downloaded_images'] as $image) {
            if (isset($image['saved_path'])) {
                $formattedImages[] = [
                    'image_url' => $this->convertToRelativePath($image['saved_path']),
                    'intro'     => '', // No description is scraped.
                ];
            }
        }

        return serialize($formattedImages);
    }

    /**
     * Site-relative path of the first downloaded image (used as `litpic`).
     *
     * @param array $imagesData
     * @return string Empty string when no image is available
     */
    private function getFirstImagePath($imagesData)
    {
        if (empty($imagesData['downloaded_images'])) {
            return '';
        }

        $firstImage = $imagesData['downloaded_images'][0];
        if (isset($firstImage['saved_path'])) {
            return $this->convertToRelativePath($firstImage['saved_path']);
        }

        return '';
    }

    /**
     * Convert an absolute file path into a site-relative path.
     *
     * Paths under <root>/public/ are made relative to that directory, other
     * paths under the project root are made relative to the root. Anything
     * else is returned unchanged.
     *
     * @param string $absolutePath
     * @return string Path with forward slashes and a leading "/"
     */
    private function convertToRelativePath($absolutePath)
    {
        // Compare with forward slashes only so Windows paths match as well.
        $normalized = str_replace('\\', '/', $absolutePath);
        $root       = rtrim(str_replace('\\', '/', ROOT_PATH), '/') . '/';

        foreach ([$root . 'public/', $root] as $prefix) {
            if (strpos($normalized, $prefix) === 0) {
                return '/' . ltrim(substr($normalized, strlen($prefix)), '/');
            }
        }

        return $absolutePath;
    }

    /**
     * Write an operation entry to the application log.
     *
     * @param int    $aid
     * @param string $action
     * @param string $description
     */
    private function logAdminAction($aid, $action, $description)
    {
        \think\Log::record("管理员操作:{$action} - AID: {$aid} - {$description}");
    }
}
eyoucms数据采集任务:起步价格1880

