
Amazon Scraper

Scraper requirement: given a keyword, crawl the matching Amazon product listings and write the extracted data to a database.

The overall logic:
1. Determine how many search-result (list) pages there are.
2. Build and fetch the URL of each list page.
3. From each list page, collect the URLs of the product detail pages.
4. Extract the required fields from each detail page.
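
Before the implementation, here is a minimal sketch of how those four steps could be chained together. It calls the helper functions defined below; get_detail_info (the step-4 field extractor) is only sketched at the end of this post, and the interface of the local Database class is an assumption, so treat this as an outline rather than the original author's driver.

# Sketch of a driver tying steps 1-4 together (not from the original post).
def main(keyword, max_pages=5):
    base_url = get_url(keyword)                                        # step 2: paginated search URL
    total_pages = get_page_url_num(base_url + "1")                     # step 1: number of result pages
    page_urls = get_page_url_list(0, min(total_pages, max_pages), keyword)
    detail_urls, image_urls, failed = get_detail_url_dict(page_urls)   # step 3
    db = Database()                                                    # project-local DB helper (interface assumed)
    for page in detail_urls:
        for url in page:
            record = get_detail_info(url)                              # step 4, sketched at the end
            db.insert("amazon_products", record)                       # hypothetical table name and method


if __name__ == "__main__":
    main("canvas print")
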
import requests
from lxml import etree
import urllib3
import time
from Database import Database  # project-local database helper (not shown in this post)
import socket
import random
import json
import ssl
import os

ssl._create_default_https_context = ssl._create_unverified_context
urllib3.disable_warnings()
# Request headers copied from a real browser session; the cookie and session
# token expire over time, so refresh them before running the scraper
headers = {
    "authority": "www.amazon.com",
    "referer": "https://www.amazon.com/",
    "cookie": "session-id=135-9270034-7902044; session-id-time=2082787201l; i18n-prefs=USD; ubid-main=133-6329801-5373634; lc-main=en_US; x-amz-captcha-1=1611561400323067; x-amz-captcha-2=A5mJb102s77jmJPXHmDTkw==; session-token=NglmrU6O168Bqrx5lTGDGYMT/SEPDr9oHKh6tOadX2whsc9nbcGpv0Sq6IbsWH3HsZeM0356/n/4hEMfVHaSRZp9AbitEPua6hu2BJqjWUum8UbFtF0lPXlS0dBb4RdzqFtuQY038nDZ4HGb5ELj/13C2LDghkkYrJ8r8efe8FR2CctuJFol/fN11G5PIAQi; skin=noskin; csm-hit=tb:G1YDRWT651WPV3D9SX2V+s-Y79DQEGWVSS7F4EEJ7AR|1611623253884&t:1611623253884&adb:adblk_yes",
    "rtt": "150",
    "downlink": "9.7",
    "ect": "4g",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75",
    "path": "/s?k=canvas+print&ref=nb_sb_noss_1",
}


# Build the paginated search URL for a keyword (the page number is appended later)
def get_url(keyword):
    url = "https://www.amazon.com/s?k=" + keyword.replace(" ", "+") + "&page="
    return url


# Get the number of search-result pages from the pagination bar
# (this XPath assumes a normal result page; a captcha/robot-check page will break it)
def get_page_url_num(url):
    res = requests.get(url, headers=headers, verify=False, timeout=100)
    html = etree.HTML(res.text)
    # The second-to-last <li> of the pagination bar holds the last page number
    num = int(html.xpath('//ul[@class="a-pagination"]//li[last()-1]//text()')[0])
    return num


# Build the list of search-result page URLs for pages start_num+1 through end_num
def get_page_url_list(start_num, end_num, keyword):
    basic_url = get_url(keyword)
    page_url_list = []
    for i in range(start_num, end_num):
        url = basic_url + str(i + 1)
        page_url_list.append(url)
    return page_url_list


# Collect the detail-page URLs and thumbnail URLs from each search-result page
def get_detail_url_dict(page_url_list):
    detail_url_list = []
    imag_url_list = []
    basic = "https://www.amazon.com"
    exception_detail_url_list = []
    for page_url in page_url_list:
        try:
            res = requests.get(page_url, headers=headers, verify=False, timeout=100)
            time.sleep(random.randint(3, 10))  # throttle requests to avoid being blocked
            res.close()
            socket.setdefaulttimeout(30)
            html = etree.HTML(res.text)
            # Relative hrefs of the product detail pages on this result page
            detail_url = html.xpath(
                '//div[@class="a-section a-spacing-none a-spacing-top-small"]//a[@class="a-link-normal a-text-normal"]/@href')
            detail_url = [basic + href for href in detail_url]
            detail_url_list.append(detail_url)
            # Product thumbnails: try the tall-aspect layout first, fall back to square-aspect
            imag_url = html.xpath(
                '//div[@class="a-section aok-relative s-image-tall-aspect"]/img/@src')
            if len(imag_url) == 0:
                imag_url = html.xpath('//div[@class="a-section aok-relative s-image-square-aspect"]/img/@src')
            imag_url_list.append(imag_url)
        except Exception as e:
            # Record the result page that failed so it can be retried later
            print(e)
            exception_detail_url_list.append(page_url)
    return detail_url_list, imag_url_list, exception_detail_url_list
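
The original post breaks off here, before step 4. Below is a minimal sketch of a detail-page extractor that the driver at the top could call; the field set and XPath expressions are assumptions about Amazon's markup, not taken from the post, so adjust them to the fields you actually need.

# Step 4 (sketch): extract a few fields from one product detail page.
# The XPaths and field names below are assumptions, not from the original post.
def get_detail_info(detail_url):
    res = requests.get(detail_url, headers=headers, verify=False, timeout=100)
    time.sleep(random.randint(3, 10))  # throttle, same as the list-page requests
    res.close()
    html = etree.HTML(res.text)
    title = "".join(html.xpath('//span[@id="productTitle"]/text()')).strip()
    price_nodes = html.xpath('//span[contains(@class, "a-price")]/span[@class="a-offscreen"]/text()')
    price = price_nodes[0].strip() if price_nodes else ""
    return {"url": detail_url, "title": title, "price": price}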