import csv
import os
import threading
import time

import requests
from fake_useragent import UserAgent
from lxml import etree
from pybloom_live import BloomFilter

DATABASE = '一带一路book'  # base name of the Bloom-filter dump file ('<DATABASE>.blm')
SOURCE = 'https://www.amazon.cn/'
COUNT = 5  # how many times to retry a failed request
def load_bloom_filter():
    """Load the persisted URL Bloom filter, or create a fresh one."""
    path = '{}.blm'.format(DATABASE)
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return BloomFilter.fromfile(f)
    return BloomFilter(capacity=1000000, error_rate=0.001)

bf = load_bloom_filter()
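# Note: a Bloom filter trades exactness for size. With capacity 1000000 and
# error_rate 0.001 it never misses a URL that was actually added, but roughly
# 0.1% of genuinely new URLs may be falsely reported as "seen" and skipped.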
def save_to_csv(row):
    """Append one 22-column record (book_name, author, content, 12 information
    fields, 7 description fields) to the output CSV."""
    with open(r'D:\update\book.csv', 'a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerow(row)
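# The script never writes a CSV header row. A minimal sketch (hypothetical
# helper, assuming the 22-column layout used by save_to_csv) that emits one
# only when the output file does not exist yet:
def ensure_csv_header(path=r'D:\update\book.csv'):
    if not os.path.exists(path):
        header_row = (['book_name', 'author', 'content']
                      + ['information_{}'.format(i) for i in range(1, 13)]
                      + ['description_{}'.format(i) for i in range(1, 8)])
        with open(path, 'w', newline='', encoding='utf-8') as file:
            csv.writer(file).writerow(header_row)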
def parse(count, header, url):
    """GET the URL, retrying up to `count` times; return the response or None."""
    while count:
        try:
            response = requests.get(url, headers=header, timeout=20)
            if response.status_code == 200:
                return response
            count -= 1
        except requests.RequestException:
            count -= 1
    return None
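# A gentler variant (hypothetical sketch, not wired in below): sleeping with
# exponential backoff between retries is easier on the server and less likely
# to trip rate limiting.
def parse_with_backoff(retries, header, url):
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=header, timeout=20)
            if response.status_code == 200:
                return response
        except requests.RequestException:
            pass
        time.sleep(2 ** attempt)  # 1s, 2s, 4s, ...
    return None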
def dd(url, header):
    """Fetch one book detail page and append its fields to the CSV."""
    print('url is ' + url)
    response = parse(COUNT, header, url)
    if not response:
        return
    # Parse the raw bytes so lxml can pick up the page's declared encoding.
    selector = etree.HTML(response.content)

    def extract(xpath):
        # Join all matched text nodes into one string; the doubled single
        # quote escapes quotes, apparently a leftover from an earlier
        # database-backed version of this script, kept so the output
        # format stays the same.
        try:
            return ''.join(selector.xpath(xpath)).strip().replace("'", "''")
        except Exception:
            return ''

    book_name = extract('//div[@class="a-section a-spacing-none"]//h1//span[@id="productTitle"]//text()')
    author = ','.join(selector.xpath('//span[@class="author notFaded"]//a//text()')).strip().replace("'", "''")
    content = extract('//*[@id="iframeContent"]/text()')
    # Product-details bucket: up to 12 <li> entries (publisher, ISBN, ...).
    information = [extract('//td[@class="bucket"]/div/ul/li[{}]//text()'.format(i))
                   for i in range(1, 13)]
    # Editorial description blocks: s_content_0 and s_content_2 .. s_content_7.
    descriptions = [extract('//*[@id="s_content_{}"]/p/text()'.format(i))
                    for i in (0, 2, 3, 4, 5, 6, 7)]
    print(book_name + '......' + url)
    save_to_csv([book_name, author, content] + information + descriptions)
def spider():
    header = {'User-Agent': UserAgent().random, 'Accept-Language': 'zh-CN,zh;q=0.9'}
    # Walk all 71 result pages of the "一带一路" keyword search.
    for i in range(1, 71 + 1):
        main_url = "https://www.amazon.cn/s/ref=sr_pg_2?rh=n%3A658390051%2Ck%3A%E4%B8%80%E5%B8%A6%E4%B8%80%E8%B7%AF&page=" + str(i) + "&keywords=%E4%B8%80%E5%B8%A6%E4%B8%80%E8%B7%AF&ie=UTF8&qid=1532312722"
        main_response = parse(COUNT, header, main_url)
        if not main_response:
            continue
        main_selector = etree.HTML(main_response.text)
        urls = main_selector.xpath('//ul//li//div[@class="a-row a-spacing-none"]//a[@class="a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"]/@href')
        for url_final in urls:
            if url_final in bf:
                print('already crawled, skipping: ' + url_final)
                continue
            bf.add(url_final)
            # Persist the filter after every new URL so a crash loses nothing.
            with open('{}.blm'.format(DATABASE), 'wb') as f:
                bf.tofile(f)
            dd(url_final, header)
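# A concurrent variant (hypothetical sketch, not called anywhere): fetch the
# detail pages of one result page in parallel with daemon threads, using a
# join timeout so a single slow page cannot stall the crawl indefinitely.
# Caveat: save_to_csv is not synchronized, so a real version would guard the
# CSV write with a threading.Lock.
def crawl_concurrently(url_batch, header):
    threads = []
    for u in url_batch:
        t = threading.Thread(target=dd, args=(u, header))
        t.daemon = True
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join(20)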
if __name__ == '__main__':
    spider()