import csv
import os
import threading
import time

import requests
from fake_useragent import UserAgent
from lxml import etree
from pybloom_live import BloomFilter

DATABASE = '一带一路book'  # base name of the Bloom-filter dump file ('<DATABASE>.blm')
SOURCE = 'https://www.amazon.cn/'
COUNT = 5  # how many times to retry a failed request
def load_bloom_filter():
    """Load the persisted URL Bloom filter, or create a fresh one."""
    path = '{}.blm'.format(DATABASE)
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return BloomFilter.fromfile(f)
    return BloomFilter(capacity=1000000, error_rate=0.001)

bf = load_bloom_filter()
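# Note: a Bloom filter trades exactness for size. With capacity 1000000 and
# error_rate 0.001 it never misses a URL that was actually added, but roughly
# 0.1% of genuinely new URLs may be falsely reported as "seen" and skipped.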
def save_to_csv(row):
    """Append one 22-column record (book_name, author, content, 12 information
    fields, 7 description fields) to the output CSV."""
    with open(r'D:\update\book.csv', 'a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerow(row)
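# The script never writes a CSV header row. A minimal sketch (hypothetical
# helper, assuming the 22-column layout used by save_to_csv) that emits one
# only when the output file does not exist yet:
def ensure_csv_header(path=r'D:\update\book.csv'):
    if not os.path.exists(path):
        header_row = (['book_name', 'author', 'content']
                      + ['information_{}'.format(i) for i in range(1, 13)]
                      + ['description_{}'.format(i) for i in range(1, 8)])
        with open(path, 'w', newline='', encoding='utf-8') as file:
            csv.writer(file).writerow(header_row)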
def parse(count, header, url):
    """GET the URL, retrying up to `count` times; return the response or None."""
    while count:
        try:
            response = requests.get(url, headers=header, timeout=20)
            if response.status_code == 200:
                return response
            count -= 1
        except requests.RequestException:
            count -= 1
    return None
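# A gentler variant (hypothetical sketch, not wired in below): sleeping with
# exponential backoff between retries is easier on the server and less likely
# to trip rate limiting.
def parse_with_backoff(retries, header, url):
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=header, timeout=20)
            if response.status_code == 200:
                return response
        except requests.RequestException:
            pass
        time.sleep(2 ** attempt)  # 1s, 2s, 4s, ...
    return None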
def dd(url, header):
    """Fetch one book detail page and append its fields to the CSV."""
    print('url is ' + url)
    response = parse(COUNT, header, url)
    if not response:
        return
    # Parse the raw bytes so lxml can pick up the page's declared encoding.
    selector = etree.HTML(response.content)

    def extract(xpath):
        # Join all matched text nodes into one string; the doubled single
        # quote escapes quotes, apparently a leftover from an earlier
        # database-backed version of this script, kept so the output
        # format stays the same.
        try:
            return ''.join(selector.xpath(xpath)).strip().replace("'", "''")
        except Exception:
            return ''

    book_name = extract('//div[@class="a-section a-spacing-none"]//h1//span[@id="productTitle"]//text()')
    author = ','.join(selector.xpath('//span[@class="author notFaded"]//a//text()')).strip().replace("'", "''")
    content = extract('//*[@id="iframeContent"]/text()')
    # Product-details bucket: up to 12 <li> entries (publisher, ISBN, ...).
    information = [extract('//td[@class="bucket"]/div/ul/li[{}]//text()'.format(i))
                   for i in range(1, 13)]
    # Editorial description blocks: s_content_0 and s_content_2 .. s_content_7.
    descriptions = [extract('//*[@id="s_content_{}"]/p/text()'.format(i))
                    for i in (0, 2, 3, 4, 5, 6, 7)]
    print(book_name + '......' + url)
    save_to_csv([book_name, author, content] + information + descriptions)
def spider():
    header = {'User-Agent': UserAgent().random, 'Accept-Language': 'zh-CN,zh;q=0.9'}
    # Walk all 71 result pages of the "一带一路" keyword search.
    for i in range(1, 71 + 1):
        main_url = "https://www.amazon.cn/s/ref=sr_pg_2?rh=n%3A658390051%2Ck%3A%E4%B8%80%E5%B8%A6%E4%B8%80%E8%B7%AF&page=" + str(i) + "&keywords=%E4%B8%80%E5%B8%A6%E4%B8%80%E8%B7%AF&ie=UTF8&qid=1532312722"
        main_response = parse(COUNT, header, main_url)
        if not main_response:
            continue
        main_selector = etree.HTML(main_response.text)
        urls = main_selector.xpath('//ul//li//div[@class="a-row a-spacing-none"]//a[@class="a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"]/@href')
        for url_final in urls:
            if url_final in bf:
                print('already crawled, skipping: ' + url_final)
                continue
            bf.add(url_final)
            # Persist the filter after every new URL so a crash loses nothing.
            with open('{}.blm'.format(DATABASE), 'wb') as f:
                bf.tofile(f)
            dd(url_final, header)
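# A concurrent variant (hypothetical sketch, not called anywhere): fetch the
# detail pages of one result page in parallel with daemon threads, using a
# join timeout so a single slow page cannot stall the crawl indefinitely.
# Caveat: save_to_csv is not synchronized, so a real version would guard the
# CSV write with a threading.Lock.
def crawl_concurrently(url_batch, header):
    threads = []
    for u in url_batch:
        t = threading.Thread(target=dd, args=(u, header))
        t.daemon = True
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join(20)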
if __name__ == '__main__':
    spider()