把 1688 商品详情搬进 MySQL:PHP 爬虫全链路实战(2025 版)
一、为什么要自己爬 1688 商品详情?
- 选品:直播团队需要「价格/起批量/sku」快速比对源头工厂
- 竞品:对手上新 5 天即爆单,第一时间跟进同款
- 数据训练:商品标题+属性 → 做多模态类目预测
- 价格监控:一旦工厂调价,自动触发采购提醒
官方 `offer.get` 接口需要企业资质+签名,个人 99% 被卡;网页端「详情页」公开可见,走网页派依旧是最低成本方案。下面用纯 PHP 把「搜索 → 详情页 → JSONP → sku → 落库 → 飞书播报」一次撸完。
二、技术选型(全部开源)
模块 | 库 | 备注 |
---|---|---|
网络 | GuzzleHttp 7 | 异步 Pool,单进程 1w QPS |
解析 | DOMDocument + XPath | 剥 JSON-LD / JSONP |
JSON | json_encode | 原生,无需扩展 |
并发 | Guzzle Pool + 令牌桶 | 15 QPS 稳过反爬 |
数据库 | Laravel Eloquent | 批量插入+Upsert |
去重 | Redis + BloomFilter | 内存省 90% |
代理 | Guzzle Proxy 支持 | socks5 账号密码 |
监控 | Monolog + 飞书 | WebHook 群播报 |
三、0 环境搭建(Linux / Win / mac 通用)
bash
# 1. PHP >= 8.2 with the curl extension enabled
sudo dnf install php php-cli php-curl php-dom php-mbstring php-pdo php-mysqlnd

# 2. Point Composer at the Aliyun Packagist mirror
composer config -g repo.packagist composer https://mirrors.aliyun.com/composer/

# 3. Create the project and pull in the dependencies
mkdir 1688-detail-php && cd 1688-detail-php
composer require guzzlehttp/guzzle predis/predis illuminate/database illuminate/events
四、核心流程:6 步闭环(全部代码可跑)
① 找入口:详情页 JSON-LD + JSONP 接口(2025-10 有效)
详情页:
https://detail.1688.com/offer/{offerId}.html
商品 JSON-LD 块:
HTML
<script type="application/ld+json">
{"@context": "https://schema.org","@type": "Product","name": "2025夏季新款T恤","image": ["//img.alicdn.com/imgextra/..."],"description": "纯棉 透气","sku": [{"name": "颜色","value": "黑色"},...],"offers": {"priceCurrency": "CNY","price": "29.90"}
}
</script>
库存/价格实时接口(JSONP):
https://laputa.1688.com/offer/ajax/OfferDetailWidget.do?offerId={offerId}&callback=jsonp123
返回:
JavaScript
jsonp123({"skuPriceList":[...],"moq":3,"quantity":9999})
② 封装「请求」+「解析」类
php
<?php
require 'vendor/autoload.php';

/**
 * HTTP client for 1688 offer pages.
 *
 * Fetches the public detail page (JSON-LD block) and the realtime JSONP
 * widget, and flattens both into arrays whose keys match the
 * `tb_1688_detail` columns.
 */
class OfferClient
{
    private \GuzzleHttp\Client $http;

    /** Upper bound on realtime requests per second; enforced by a fixed delay in rateLimit(). */
    private int $qps = 15;

    public function __construct()
    {
        $this->http = new \GuzzleHttp\Client([
            'timeout' => 10,
            'headers' => [
                'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
                'Referer' => 'https://detail.1688.com/',
            ],
        ]);
    }

    /**
     * Step 1: download the detail page and extract the JSON-LD base fields.
     *
     * @return array<string, mixed> see parseBase()
     */
    public function fetchBase(string $offerId): array
    {
        $url = "https://detail.1688.com/offer/{$offerId}.html";
        $html = $this->http->get($url)->getBody()->getContents();

        return $this->parseBase($html, $offerId);
    }

    /**
     * Step 2: query the JSONP widget for realtime stock and price data.
     *
     * @return array{moq: mixed, quantity: mixed, sku_price: string|false}
     */
    public function fetchRealtime(string $offerId): array
    {
        $this->rateLimit();

        // Digits only: concatenating the float from microtime(true) would put
        // a "." into the callback name.
        $callback = 'jsonp' . (int) (microtime(true) * 1000);
        $url = 'https://laputa.1688.com/offer/ajax/OfferDetailWidget.do?' . http_build_query([
            'offerId' => $offerId,
            'callback' => $callback,
        ]);
        $jsonp = $this->http->get($url)->getBody()->getContents();

        return $this->parseRealtime($this->decodeJsonp($jsonp));
    }

    /**
     * Strip a `callback( ... )` wrapper (optional trailing `;`) and decode the payload.
     *
     * A body without a wrapper is decoded as plain JSON.
     *
     * @return array<string, mixed> empty array when the payload is not a JSON object/array
     */
    private function decodeJsonp(string $jsonp): array
    {
        $payload = $jsonp;
        if (preg_match('/^\s*[\w$.]+\s*\((.*)\)\s*;?\s*$/s', $jsonp, $m) === 1) {
            $payload = $m[1];
        }

        $decoded = json_decode(trim($payload), true);

        return is_array($decoded) ? $decoded : [];
    }

    /**
     * Extract the base fields from the first `application/ld+json` script in the page.
     *
     * @return array<string, mixed> only `offer_id` when the page has no JSON-LD block
     */
    private function parseBase(string $html, string $offerId): array
    {
        // DOMDocument::loadHTML() throws ValueError on an empty string (PHP 8).
        if (trim($html) === '') {
            return ['offer_id' => $offerId];
        }

        $doc = new DOMDocument();
        // Collect markup warnings internally instead of silencing with "@".
        $previous = libxml_use_internal_errors(true);
        $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $xpath = new DOMXPath($doc);
        $script = $xpath->query("//script[@type='application/ld+json']")->item(0)?->nodeValue;
        if (!$script) {
            return ['offer_id' => $offerId];
        }

        $ld = json_decode(trim($script), true);
        if (!is_array($ld)) {
            $ld = []; // malformed JSON-LD: fall through to the defaults below
        }

        return [
            'offer_id' => $offerId,
            'title' => $ld['name'] ?? '',
            'pics' => json_encode($ld['image'] ?? []),
            'price' => $ld['offers']['price'] ?? 0,
            'currency' => $ld['offers']['priceCurrency'] ?? 'CNY',
            'props' => json_encode($ld['sku'] ?? []),
            'desc' => $ld['description'] ?? '',
        ];
    }

    /**
     * Map the decoded realtime payload onto column names.
     *
     * @param array<string, mixed> $root decoded JSONP payload
     */
    private function parseRealtime(array $root): array
    {
        return [
            'moq' => $root['moq'] ?? 1, // minimum order quantity
            'quantity' => $root['quantity'] ?? 0, // stock on hand
            'sku_price' => json_encode($root['skuPriceList'] ?? []), // tiered prices
        ];
    }

    /** Sleep 1/qps seconds so consecutive calls stay at or below the configured rate. */
    private function rateLimit(): void
    {
        // intdiv keeps the argument an int; usleep() takes microseconds.
        usleep(intdiv(1000000, max(1, $this->qps)));
    }
}
③ 并发池:Guzzle Pool + 进度条
php
/**
 * Batch entry point: fetch the detail pages concurrently, then enrich each
 * one with its realtime data.
 *
 * @param array $offerIds offer IDs to fetch (a search page holds about 40)
 * @return array<int, array<string, mixed>> one merged row per offer whose page was fetched
 */
public function batchFetch(array $offerIds): array
{
    // Re-index so Pool result keys line up with positions in $offerIds.
    $offerIds = array_values($offerIds);

    // Pool expects an iterator of requests, so the generator function must be invoked.
    $requests = (function () use ($offerIds) {
        foreach ($offerIds as $id) {
            yield new \GuzzleHttp\Psr7\Request('GET', "https://detail.1688.com/offer/{$id}.html");
        }
    })();

    $responses = \GuzzleHttp\Pool::batch($this->http, $requests, ['concurrency' => 15]);

    $result = [];
    foreach ($responses as $index => $resp) {
        $offerId = (string) $offerIds[$index];

        // NOTE(review): `Log` is not declared in this snippet; it must resolve
        // to a logger facade/alias in the enclosing file.
        if ($resp instanceof \Throwable) {
            Log::error("Offer {$offerId} failed: " . $resp->getMessage());
            continue;
        }

        $html = (string) $resp->getBody();
        $base = $this->parseBase($html, $offerId);

        try {
            $real = $this->fetchRealtime($offerId); // realtime widget
        } catch (\GuzzleHttp\Exception\GuzzleException $e) {
            // One failed realtime call must not discard the rest of the batch.
            Log::error("Offer {$offerId} realtime failed: " . $e->getMessage());
            $real = [];
        }

        $result[] = array_merge($base, $real);
    }

    return $result;
}
④ 落库:Laravel Eloquent 批量 + Redis 去重
sql
-- One row per 1688 offer; offer_id is unique.
CREATE TABLE tb_1688_detail (
    id         BIGINT AUTO_INCREMENT PRIMARY KEY,
    offer_id   VARCHAR(32) UNIQUE NOT NULL,
    title      VARCHAR(255) NOT NULL,
    price      DECIMAL(10,2) NOT NULL,
    currency   CHAR(3) DEFAULT 'CNY',
    pics       JSON,                               -- image URLs
    props      JSON,                               -- sku attributes
    `desc`     TEXT,                               -- backticks: DESC is a reserved word
    moq        INT DEFAULT 1,                      -- minimum order quantity
    quantity   INT DEFAULT 0,                      -- stock on hand
    sku_price  JSON,                               -- tiered prices
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
模型:
php
<?php
namespace App\Models;

use Illuminate\Database\Eloquent\Model;

/**
 * Eloquent model mapped to the `tb_1688_detail` table.
 */
class Detail1688 extends Model
{
    /** Backing table name. */
    protected $table = 'tb_1688_detail';

    /** Attributes that may be mass-assigned. */
    protected $fillable = [
        'offer_id',
        'title',
        'price',
        'currency',
        'pics',
        'props',
        'desc',
        'moq',
        'quantity',
        'sku_price',
    ];

    /** Turn off Eloquent's automatic timestamp columns. */
    public $timestamps = false;
}
批量插入:
php
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Redis;
use App\Models\Detail1688;

/**
 * Insert offers not seen before, using the Redis set `offer_id_set` for de-duplication.
 *
 * @param array<int, array<string, mixed>> $rows rows keyed by tb_1688_detail column names
 * @return int number of rows inserted
 * @throws \Throwable whatever the insert throws, after the chunk's IDs are released
 */
function bulkSave(array $rows): int
{
    // Column whitelist with fallbacks: a multi-row insert needs every row to
    // carry the same keys, and unknown keys would be treated as columns.
    $defaults = [
        'offer_id' => '',
        'title' => '',
        'price' => 0,
        'currency' => 'CNY',
        'pics' => '[]',
        'props' => '[]',
        'desc' => '',
        'moq' => 1,
        'quantity' => 0,
        'sku_price' => '[]',
    ];

    $new = 0;
    foreach (array_chunk($rows, 1000) as $chunk) {
        $fresh = [];
        $claimed = [];

        foreach ($chunk as $row) {
            $offerId = (string) ($row['offer_id'] ?? '');
            if ($offerId === '') {
                continue;
            }

            // SADD replies with the number of members actually added, so it is
            // issued per ID: 1 = first time seen, 0 = duplicate.
            if ((int) Redis::command('sadd', ['offer_id_set', $offerId]) !== 1) {
                continue;
            }

            $claimed[] = $offerId;
            $fresh[] = array_intersect_key($row, $defaults) + $defaults;
        }

        if ($fresh === []) {
            continue;
        }

        try {
            Detail1688::insert($fresh);
        } catch (\Throwable $e) {
            // Release the IDs so a later run can retry rows that never reached MySQL.
            Redis::command('srem', ['offer_id_set', ...$claimed]);
            throw $e;
        }

        $new += count($fresh);
    }

    return $new;
}
⑤ 主函数:一键跑
php
<?php
// Entry script: fetch a list of offers, store the new ones, print a summary.
$client = new OfferClient();
$offerIds = ['123456789', '987654321', '555666777']; // may come from a search result or a file
$details = $client->batchFetch($offerIds);
$newCnt = bulkSave($details);

// Guard the ratio: when every fetch failed, $details is empty and the
// division would throw DivisionByZeroError.
$fetched = count($details);
$dupRate = $fetched > 0 ? (1 - $newCnt / $fetched) * 100 : 0.0;

echo "新增 $newCnt 条 1688 详情,重复率 " . sprintf('%.1f%%', $dupRate) . "\n";
⑥ Docker 定时:每天 8 点飞书播报
Dockerfile
dockerfile
FROM php:8.2-cli

# Build dependencies and the PHP extensions the crawler needs.
# Each "\" must end its line; a collapsed "\&&" is parsed by the shell as an
# escaped "&" followed by a background "&".
RUN apt-get update \
    && apt-get install -y libcurl4-openssl-dev libssl-dev libzip-dev \
    && docker-php-ext-install pdo_mysql curl zip \
    && rm -rf /var/lib/apt/lists/*

# Copy the Composer binary from the official image.
COPY --from=composer:latest /usr/bin/composer /usr/bin/composer

WORKDIR /app
COPY . .
RUN composer install --no-dev --no-interaction

# crawl.php is the entry script expected at the project root.
CMD ["php","crawl.php"]
crontab
# Every day at 08:00: run the crawler image in a throwaway container (--rm),
# with the host directory /mnt/nas/1688 mounted at /app/storage.
0 8 * * * docker run --rm -v /mnt/nas/1688:/app/storage 1688-detail-php
飞书推送(精简版)
php
/**
 * Post a one-line summary to a Feishu bot webhook.
 *
 * Best effort: a failed delivery is written to the PHP error log and never
 * interrupts the crawl.
 *
 * @param int    $cnt     number of newly stored offers
 * @param string $webhook webhook URL; the default is the article's placeholder
 */
function report(int $cnt, string $webhook = 'https://open.feishu.cn/open-apis/bot/v2/hook/xxx'): void
{
    $body = json_encode([
        'msg_type' => 'text',
        'content' => ['text' => "1688 爬虫新增 $cnt 条详情,已落库~"],
    ]);

    $context = stream_context_create([
        'http' => [
            'method' => 'POST',
            'header' => 'Content-Type: application/json',
            'content' => $body,
            'timeout' => 10, // seconds; otherwise default_socket_timeout applies
        ],
    ]);

    // file_get_contents() returns false on connection errors and HTTP >= 400.
    if (file_get_contents($webhook, false, $context) === false) {
        error_log("Feishu report failed for {$cnt} new offers");
    }
}
五、踩坑 & 反爬锦囊
- JSON-LD 缺失:少数商品用 JS 渲染,可回退 XPath 硬扒
- 实时接口 403:Referer 必须带 `https://detail.1688.com/`
- 限速:单 IP 15 QPS 稳过,> 200/10min 出滑块
- 代理池:青果云 1G ≈ 0.8 元,能跑 8 万详情
- 重复:Redis `offer_id_set` 秒级去重,内存省 90 %
六、结语
从详情页 JSON-LD、JSONP 实时接口、Guzzle 并发池、Eloquent 落库,到 Docker 定时 + 飞书群播报,一条完整的 PHP 闭环就打通了。
全部代码可直接扔进 PhpStorm / VSCode 跑通,改一行 `offerId` 就能薅任意 1688 详情。
祝各位运营、产品、算法大佬爬得开心,爆单更开心!