使用puppeteer爬取spa单页(vue/react)
编辑于 2021-09-15 18:04:28 阅读 2184
docker 部署 puppeteer
官方提供的Dockerfile1️⃣
# Image for running Puppeteer in Docker: Node 12 (slim) plus Google Chrome stable,
# extra fonts, a project-less npm install of puppeteer, and a non-root user (pptruser).
FROM node:12-slim
# Install the latest stable Chrome package and fonts to support major charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others)
# Note: this installs the necessary libs to make the bundled version of Chromium that Puppeteer
# installs, work.
# The apt package lists are removed at the end of the same RUN so they do not stay in the layer.
RUN apt-get update \
&& apt-get install -y wget gnupg \
&& wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
&& sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
&& apt-get update \
&& apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-freefont-ttf libxss1 \
--no-install-recommends \
&& rm -rf /var/lib/apt/lists/*
# If running Docker >= 1.13.0 use docker run's --init arg to reap zombie processes, otherwise
# uncomment the following lines to have `dumb-init` as PID 1
# ADD https://github.com/Yelp/dumb-init/releases/download/v1.2.2/dumb-init_1.2.2_x86_64 /usr/local/bin/dumb-init
# RUN chmod +x /usr/local/bin/dumb-init
# ENTRYPOINT ["dumb-init", "--"]
# Uncomment to skip the chromium download when installing puppeteer. If you do,
# you'll need to launch puppeteer with:
# browser.launch({executablePath: 'google-chrome-stable'})
# ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true
# Install puppeteer so it's available in the container.
# No WORKDIR is set, so npm runs in the image's root directory; that is why the
# chown calls below target /node_modules, /package.json and /package-lock.json.
RUN npm init -y && \
npm i puppeteer \
# Add user so we don't need --no-sandbox.
# same layer as npm install to keep re-chowned files from using up several hundred MBs more space
&& groupadd -r pptruser && useradd -r -g pptruser -G audio,video pptruser \
&& mkdir -p /home/pptruser/Downloads \
&& chown -R pptruser:pptruser /home/pptruser \
&& chown -R pptruser:pptruser /node_modules \
&& chown -R pptruser:pptruser /package.json \
&& chown -R pptruser:pptruser /package-lock.json
# Run everything after as non-privileged user.
USER pptruser
# Default command starts Chrome itself; override it at run time to start a Node script instead.
CMD ["google-chrome-stable"]
构建
# A proxy is required for the build. A prebuilt image has also been pushed to Aliyun and can be used directly: registry.cn-hangzhou.aliyuncs.com/cuiw/puppeteer-chrome-linux:20210916
cd puppeteer
docker build -t puppeteer-chrome-linux .
puppeteer/server.js
获取快手(ks)视频的真实播放地址:接口接收一个 url 参数(即在快手中分享出来的链接)
// Minimal HTTP service around Puppeteer.
//
//   GET /ks/video?url=<shared page link>
//
// loads the given page in headless Chromium (emulating an iPhone X), reads the
// `src` of the element with id `video-player`, and answers with
// {"url": "<video src>"} as JSON.
const http = require("http");
const puppeteer = require('puppeteer');

const PORT = 3000;

/**
 * Writes `payload` as a JSON response with the given status code and ends the response.
 *
 * @param {http.ServerResponse} res - Response to write to.
 * @param {number} statusCode - HTTP status code.
 * @param {object} payload - Value serialized with JSON.stringify.
 */
function sendJson(res, statusCode, payload) {
  res.writeHead(statusCode, {"Content-Type": "application/json"});
  res.end(JSON.stringify(payload));
}

/**
 * Reports whether `value` parses as an absolute http(s) URL.
 *
 * The target comes straight from the query string, so anything else
 * (file:, chrome:, javascript:, unparsable text, ...) is rejected before it
 * reaches the browser.
 *
 * @param {string} value - Candidate URL.
 * @returns {boolean} True only for http: and https: URLs.
 */
function isHttpUrl(value) {
  try {
    const protocol = new URL(value).protocol;
    return protocol === 'http:' || protocol === 'https:';
  } catch (err) {
    return false;
  }
}

/**
 * Opens `pageUrl` in a fresh headless browser and returns the `src` of `#video-player`.
 *
 * @param {string} pageUrl - Page to load.
 * @returns {Promise<string>} The `src` property of the `#video-player` element.
 * @throws If the browser cannot start, navigation fails, or the element is missing.
 */
async function fetchVideoSrc(pageUrl) {
  const browser = await puppeteer.launch({
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  try {
    const page = await browser.newPage();
    // Emulate a mobile browser before navigating.
    await page.emulate(puppeteer.devices['iPhone X']);
    await page.goto(pageUrl);
    return await page.$eval('#video-player', el => el.src);
  } finally {
    // Always release the Chromium process, including when goto/$eval throws.
    await browser.close();
  }
}

/**
 * Routes one incoming request; every branch ends the response.
 *
 * @param {http.IncomingMessage} req - Incoming request.
 * @param {http.ServerResponse} res - Response to complete.
 * @returns {Promise<void>} Rejects if the video lookup fails; the caller maps that to a 500.
 */
async function handleRequest(req, res) {
  let urlObj;
  try {
    urlObj = new URL('http://localhost:' + PORT + req.url);
  } catch (err) {
    // Request targets such as "*" do not form a valid URL.
    sendJson(res, 400, {'error': 'malformed request url'});
    return;
  }

  const url = urlObj.searchParams.get('url');
  if (url === null || url.length === 0) {
    // Kept as an empty 200 reply, as before, for backward compatibility.
    res.end();
    return;
  }

  if (urlObj.pathname !== '/ks/video') {
    // Previously this case never answered and left the connection hanging.
    sendJson(res, 404, {'error': 'not found'});
    return;
  }

  if (!isHttpUrl(url)) {
    sendJson(res, 400, {'error': 'url must be an absolute http(s) url'});
    return;
  }

  const video = await fetchVideoSrc(url);
  console.log(video);
  sendJson(res, 200, {'url': video});
}

const server = http.createServer(function (req, res) {
  handleRequest(req, res).catch(function (err) {
    // Never leave the client waiting and never let the rejection go unhandled.
    console.error(err);
    if (res.headersSent) {
      res.end();
    } else {
      sendJson(res, 500, {'error': 'failed to extract video url'});
    }
  });
});
server.listen(PORT);
docker-compose.yml
# Runs the Puppeteer HTTP service (server.js) in the puppeteer-chrome-linux image.
version: '3'

networks:
  puppeteer:

services:
  puppeteer:
    # Prebuilt alternative: registry.cn-hangzhou.aliyuncs.com/cuiw/puppeteer-chrome-linux:20210916
    image: puppeteer-chrome-linux
    container_name: puppeteer
    # Start the Node server from the mounted /app directory.
    command: bash -c "/usr/local/bin/node /app/server.js"
    ports:
      # Quoted so the HOST:CONTAINER mapping is always read as a string.
      - "3000:3000"
    volumes:
      # The local ./puppeteer directory (containing server.js) is mounted at /app.
      - ./puppeteer:/app
    networks:
      - puppeteer
    # Debugging aid: uncomment to keep the container alive without starting the server.
    # entrypoint: ["sh", "-c", "sleep infinity"]
测试
http://localhost:3000/ks/video?url=https://v.kuaishou.com/dlFLDr
# 如果没意外,将会输出
{"url":"https://..."}
其他相关
https://hub.docker.com/r/zenato/puppeteer-renderer
备注
1️⃣https://github.com/puppeteer/puppeteer/blob/main/docs/troubleshooting.md#running-puppeteer-in-docker