# _*_ coding:utf-8 _*_
import random

import requests
from bs4 import BeautifulSoup


def crawl_tb_product():
    """
    Crawl Tmall product listings.
    :return:
    """
    # Tmall product listing URL
    url = 'https://www.tmall.com/mlist/cp_bGFibyBsYWJvILPH0rDSvcn6.html'
    # Browser headers to disguise the request
    headers = {
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0'
    }
    # Proxy pool ("ip port" pairs); one is picked at random per request
    proxys = ['61.135.217.7 80', '58.56.108.226 43296', '58.240.232.126 57505',
              '182.111.64.7 41766', '59.32.37.118 8010', '114.225.169.36 53128',
              '59.173.74.169 8010', '121.31.155.184 8123', '121.31.192.215 8123',
              '175.175.216.210 1133', '13.121.242.49 808', '115.219.109.1 8010']
    p = random.choice(proxys)
    print(p)
    ip = p.strip().split()
    proxy = 'http://' + ip[0] + ':' + ip[1]
    # requests expects the URL scheme as the key of the proxies dict
    proxies = {'http': proxy, 'https': proxy}
    try:
        r = requests.get(url, headers=headers, proxies=proxies)
        soup = BeautifulSoup(r.text, "lxml")
        all_product = soup.find_all('div', attrs={'class': 'product'})
        name_list = []
        price_list = []
        client_list = []
        image_url_list = []
        for product in all_product:
            # Product name
            name = product.select('.productTitle a')[0].get_text()
            name_list.append(name)
            # Product image (protocol-relative URL, so prepend 'https:')
            image_url = product.select('.productImg-wrap a img')[0].attrs['src']
            image_url_list.append('https:' + image_url)
            # Price
            price = product.select('.productPrice em')[0].get_text()
            price_list.append(price)
            # Shop / seller
            client = product.select('.productShop')[0].get_text().strip()
            client_list.append(client)
        # Print the crawled lists
        print(name_list)
        print(price_list)
        print(client_list)
        print(image_url_list)
        print('Query succeeded!')
    except Exception as e:
        print(e)
        print('Failed to crawl the page!')
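
# Minimal usage sketch (an assumption, not part of the original script): run the
# crawler directly from the command line. A reachable proxy from the list above
# and the 'lxml' parser (pip install lxml) are required for the request to succeed.
if __name__ == '__main__':
    crawl_tb_product()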