"""Amazon product scraper.

Requirement: given a keyword, crawl Amazon product listings and write the
results to a database.

Flow:
1. Determine how many search-result (list) pages there are.
2. Build and fetch the URL of each list page.
3. From each list page, collect the URLs of the product detail pages.
4. Extract the required fields from each detail page.
"""
import requests
from lxml import etree
import urllib3
import time
from Database import Database
import socket
import random
import json
import ssl
import os
# Disable TLS certificate verification globally; paired with verify=False on
# each request below. NOTE(review): this weakens security — acceptable only
# for a throwaway scraper.
ssl._create_default_https_context = ssl._create_unverified_context
# Suppress the InsecureRequestWarning spam that verify=False would otherwise
# print on every request.
urllib3.disable_warnings()
# Browser-impersonating request headers. The cookie carries a captured
# session (session-id / session-token / captcha tokens) — presumably copied
# from a real browser session; it will expire and need refreshing.
headers = {
    "authority": "www.amazon.com",
    "referer": "https://www.amazon.com/",
    "cookie": "session-id=135-9270034-7902044; session-id-time=2082787201l; i18n-prefs=USD; ubid-main=133-6329801-5373634; lc-main=en_US; x-amz-captcha-1=1611561400323067; x-amz-captcha-2=A5mJb102s77jmJPXHmDTkw==; session-token=NglmrU6O168Bqrx5lTGDGYMT/SEPDr9oHKh6tOadX2whsc9nbcGpv0Sq6IbsWH3HsZeM0356/n/4hEMfVHaSRZp9AbitEPua6hu2BJqjWUum8UbFtF0lPXlS0dBb4RdzqFtuQY038nDZ4HGb5ELj/13C2LDghkkYrJ8r8efe8FR2CctuJFol/fN11G5PIAQi; skin=noskin; csm-hit=tb:G1YDRWT651WPV3D9SX2V+s-Y79DQEGWVSS7F4EEJ7AR|1611623253884&t:1611623253884&adb:adblk_yes",
    "rtt": "150",
    "downlink": "9.7",
    "ect": "4g",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75",
    "path": "/s?k=canvas+print&ref=nb_sb_noss_1",
}
def get_url(keyword):
    """Build the Amazon search base URL for *keyword*.

    The returned URL ends in "&page=" so the caller can append a page number.

    Args:
        keyword: search phrase, e.g. "canvas print".

    Returns:
        str: "https://www.amazon.com/s?k=<encoded keyword>&page="
    """
    # Local import so this function is self-contained.
    from urllib.parse import quote_plus

    # quote_plus encodes spaces as "+" (matching the original behavior) and
    # also percent-encodes reserved characters such as "&", "=", "#" that the
    # old replace(" ", "+") left in place, which corrupted the query string.
    return "https://www.amazon.com/s?k=" + quote_plus(keyword) + "&page="
def get_page_url_num(url):
    """Fetch a search page and return the total number of result pages.

    Reads the next-to-last item of the pagination bar (the last page number;
    the final item is the "Next" arrow).

    Args:
        url: full search URL to fetch.

    Returns:
        int: number of result pages; 1 when no pagination bar is found
        (single page of results, or Amazon served a captcha/blocked page —
        the original code raised IndexError in that case).
    """
    res = requests.get(url, headers=headers, verify=False, timeout=100)
    html = etree.HTML(res.text)
    texts = html.xpath('//ul[@class="a-pagination"]//li[last()-1]//text()')
    if not texts:
        # No pagination present: treat as a single page instead of crashing.
        return 1
    try:
        return int(texts[0])
    except ValueError:
        # Pagination text was not a number (layout change / captcha page).
        return 1
def get_page_url_list(start_num, end_num, keyword):
    """Build search-result page URLs for *keyword*.

    Produces URLs for page numbers start_num+1 through end_num inclusive
    (the loop index is zero-based; page numbers in the URL are one-based).

    Args:
        start_num: zero-based index of the first page.
        end_num: exclusive zero-based end index.
        keyword: search phrase; spaces become "+" in the URL.

    Returns:
        list[str]: one URL per page, in ascending page order.
    """
    base = "https://www.amazon.com/s?k=" + keyword.replace(" ", "+") + "&page="
    return [base + str(page + 1) for page in range(start_num, end_num)]
def get_detail_url_dict(page_url_list):
detail_url_list = []
imag_url_list = []
basic = "https://www.amazon.com"
exception_detail_url_list = []
for i in page_url_list:
try:
res = requests.get(i, headers=headers, verify=False, timeout=100)
time.sleep(random.randint(3, 10))
res.close()
socket.setdefaulttimeout(30)
html = etree.HTML(res.text)
detail_url = html.xpath(
'//div[@class="a-section a-spacing-none a-spacing-top-small"]//a[@class="a-link-normal a-text-normal"]/@href')
detail_url = [basic + i for i in detail_url]
detail_url_list.append(detail_url)
imag_url = html.xpath(
'//div[@class="a-section aok-relative s-image-tall-aspect"]/img/@src')
if len(imag_url) != 0:
pass
else:
imag_url = html.xpath('//div[@class="a-section aok-relative s-image-square-aspect"]/img/@src')
imag_url_list.append(imag_url)
except Exception as e:
exception_detail_url_list.append(deta