nodejs爬虫-使用puppeteer 爬取图片数据

日期: 2019-10-19         浏览量: 7305

简单写了一个爬虫(主要使用 puppeteer),用于爬取图片信息,分享给大家。


const puppeteer = require('puppeteer');
const fs = require('fs');
const request = require('request');
const path = require('path');

// Crawl product images from Mogujie (mogu.com): open the search result list,
// visit every product page found there and download its gallery images and
// its detail images into ./product/<product name>/{images,detailImg}/.
// NOTE(review): the original comment said "children's clothing", but the
// search query below is 裙子 (dresses/skirts) — confirm which is intended.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://list.mogu.com/search/goods?q=裙子');

    // href attribute of every product link on the list page.
    const hrefs = await page.$$eval('a.pin-box', el => el.map(x => x.getAttribute("href")));
    // Remember the list URL so relative / protocol-relative links can be resolved.
    const listUrl = page.url();

    // Visit the product pages one by one.
    for (const href of hrefs) {
      if (!href) {
        continue; // anchor without an href attribute
      }
      try {
        await page.goto(new URL(href, listUrl).href);
        const pageUrl = page.url();

        const title = await page.$('span.title');
        if (!title) {
          console.warn(`No product title found, skipping: ${pageUrl}`);
          continue;
        }
        const rawName = await (await title.getProperty('innerText')).jsonValue(); // product name
        // The name becomes a directory name, so strip characters that are
        // path separators or illegal in file names on common file systems.
        const name = String(rawName).trim().replace(/[\\/:*?"<>|]/g, '_');
        if (name === '') {
          console.warn(`Empty product title, skipping: ${pageUrl}`);
          continue;
        }

        // Gallery thumbnails (src) and lazily loaded detail images (data-original).
        const images = await page.$$eval('li.img>img', el => el.map(x => x.getAttribute('src')));
        const detailImg = await page.$$eval('img.lazy', el => el.map(x => x.getAttribute('data-original')));

        const imagesDir = `./product/${name}/images/`;
        const detailDir = `./product/${name}/detailImg/`;
        mkdirSync(imagesDir);
        mkdirSync(detailDir);

        let counter = 0; // appended to the timestamp so file names never collide
        for (const thumb of images) {
          if (!thumb) {
            continue;
          }
          // Dropping the thumbnail suffix yields the URL of the full-size image.
          const src = new URL(thumb.replace(/_100x100\.jpg/, ''), pageUrl).href;
          await downloadImg(src, `${imagesDir}${Date.now()}-${counter++}.jpg`);
        }
        for (const detail of detailImg) {
          if (!detail) {
            continue;
          }
          const src = new URL(detail, pageUrl).href;
          await downloadImg(src, `${detailDir}${Date.now()}-${counter++}.jpg`);
        }
      } catch (err) {
        // One broken product page must not abort the whole crawl.
        console.error(`Failed to crawl ${href}:`, err);
      }
    }
  } finally {
    // Always shut the browser down, even when the crawl fails half way.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});


/**
 * Synchronously create a directory together with any missing parent
 * directories. Calling it for a directory that already exists is a no-op.
 *
 * @param {string} dirname - Path of the directory to create.
 * @returns {boolean} Always `true` once the directory exists.
 * @throws {Error} If the directory cannot be created, e.g. because the path
 *   (or one of its parents) already exists as a regular file.
 */
function mkdirSync(dirname) {
  // `recursive: true` creates the whole chain in one call and tolerates an
  // already existing directory, so no separate existence check is needed.
  fs.mkdirSync(dirname, { recursive: true });
  return true;
}

/**
 * Download a file and save it to disk.
 *
 * The response stream returned by `request(src)` is piped into a write stream
 * opened on `path`.
 *
 * @param {string} src - URL of the file to download.
 * @param {string} path - Destination file path.
 * @returns {Promise<void>} Resolves once the write stream has emitted
 *   `finish`; rejects with the underlying error if either the download stream
 *   or the file write stream fails.
 */
function downloadImg(src, path) {
  return new Promise((resolve, reject) => {
    const writeStream = fs.createWriteStream(path);
    const readStream = request(src);

    readStream.on('end', () => {
      console.log('文件下载成功');
    });
    readStream.on('error', (err) => {
      console.log(`错误信息:${err}`);
      // pipe() does not close the destination when the source fails, so
      // release the file handle explicitly before reporting the failure.
      writeStream.destroy();
      reject(err);
    });
    writeStream.on('error', (err) => {
      console.log(`错误信息:${err}`);
      reject(err);
    });
    writeStream.on('finish', () => {
      console.log('文件写入成功');
      resolve();
    });

    // Start piping only after all listeners are attached.
    readStream.pipe(writeStream);
  });
}