Amazon crawler in Python

An assignment I was given while applying for an internship:

Final result:

The implementation is split into two parts: Part 1 collects book ASINs from Amazon's category listing pages and writes them to text files; Part 2 reads those ASINs, scrapes each book's Amazon detail page and the matching JD listing, and stores the results in MySQL.

Part 1:

import requests
import re
from pyquery import PyQuery as pq

# Fetch one proxy from the local proxy-pool service listening on 127.0.0.1:5010
def get_proxy():
    return requests.get("http://127.0.0.1:5010/get/").content.decode().strip()

proxy = get_proxy()  # current proxy, shared by url_open() via the global declaration below

# requests wrapper that routes through the proxy and retries on failure
def url_open(url):
    header = {'User-Agent': 'Mozilla/5.0 ', 'X-Requested-With': 'XMLHttpRequest'}
    global proxy
    try:
        if proxy:
            print('Using proxy', proxy)
            proxies = {'http': 'http://' + proxy}
            response = requests.get(url=url, headers=header, proxies=proxies)
        else:
            response = requests.get(url=url, headers=header)
        if response.status_code == 200:
            return response.text
        if response.status_code == 503:
            print('503')
            proxy = get_proxy()   # throttled by Amazon: switch to a fresh proxy and retry
            if proxy:
                return url_open(url)
            else:
                print('Failed to obtain a proxy')
                return None
    except Exception:
        proxy = get_proxy()       # request error (bad proxy, timeout, ...): swap the proxy and retry
        return url_open(url)

########### Extract the entry links of the literature sub-categories ################
# `html` is a snippet of the category sidebar copied from an Amazon search results page;
# each link's rh=... query fragment identifies one sub-category.
html='href="/s/ref=lp_144180071_nr_n_0fst=as%3Aoff&amp;rh=n%3A116087071%2Cn%3A%21116088071%2Cn%3A116169071%2Cn%3A144180071%2Cn%3A144201071&amp;bbn=144180071&amp;ie=UTF8&amp;qid=1533176532&amp;rnid=144180071"><span class="a-size-small a-color-base">文学名家</span></a></span></li><li><span class="a-list-item"><a class="a-link-normal s-ref-text-link" href="/s/ref=lp_144180071_nr_n_1?href="/s/ref=lp_144180071_nr_n_1?fst=as%3Aoff&amp;rh=n%3A116087071%2Cn%3A%21116088071%2Cn%3A116169071%2Cn%3A144180071%2Cn%3A144206071&amp;bbn=144180071&amp;ie=UTF8&amp;qid=1533176532&amp;rnid=144180071"><span class="a-size-small a-color-base">作品集</span></a></span></li><li><span class="a-list-item"><a class="a-link-normal s-ref-text-link" href="/s/ref=lp_144180071_nr_n_2?fst=as%3Aoff&amp;rh=n%3A116087071%2Cn%3A%21116088071%2Cn%3A116169071%2Cn%3A144180071%2Cn%3A144212071&amp;bbn=144180071&amp;ie=UTF8&amp;qid=1533176532&amp;rnid=144180071"><span class="a-size-small a-color-base">散文随笔</span></a></span></li><li><span class="a-list-item"><a class="a-link-normal s-ref-text-link" href="/s/ref=lp_144180071_nr_n_3?fst=as%3Aoff&amp;rh=n%3A116087071%2Cn%3A%21116088071%2Cn%3A116169071%2Cn%3A144180071%2Cn%3A144222071&amp;bbn=144180071&amp;ie=UTF8&amp;qid=1533176532&amp;rnid=144180071"><span class="a-size-small a-color-base">诗歌词曲</span></a></span></li><li><span class="a-list-item"><a class="a-link-normal s-ref-text-link" href="/s/ref=lp_144180071_nr_n_4?fst=as%3Aoff&amp;rh=n%3A116087071%2Cn%3A%21116088071%2Cn%3A116169071%2Cn%3A144180071%2Cn%3A144235071&amp;bbn=144180071&amp;ie=UTF8&amp;qid=1533176532&amp;rnid=144180071"><span class="a-size-small a-color-base">民间文学</span></a></span></li><li><span class="a-list-item"><a class="a-link-normal s-ref-text-link" href="/s/ref=lp_144180071_nr_n_5?fst=as%3Aoff&amp;rh=n%3A116087071%2Cn%3A%21116088071%2Cn%3A116169071%2Cn%3A144180071%2Cn%3A144228071&amp;bbn=144180071&amp;ie=UTF8&amp;qid=1533176532&amp;rnid=144180071"><span class="a-size-small a-color-base">纪实文学</span></a></span></li><li><span class="a-list-item"><a class="a-link-normal s-ref-text-link" href="/s/ref=lp_144180071_nr_n_6?fst=as%3Aoff&amp;rh=n%3A116087071%2Cn%3A%21116088071%2Cn%3A116169071%2Cn%3A144180071%2Cn%3A144218071&amp;bbn=144180071&amp;ie=UTF8&amp;qid=1533176532&amp;rnid=144180071"><span class="a-size-small a-color-base">影视文学</span></a></span></li><li><span class="a-list-item"><a class="a-link-normal s-ref-text-link" href="/s/ref=lp_144180071_nr_n_7?fst=as%3Aoff&amp;rh=n%3A116087071%2Cn%3A%21116088071%2Cn%3A116169071%2Cn%3A144180071%2Cn%3A144234071&amp;bbn=144180071&amp;ie=UTF8&amp;qid=1533176532&amp;rnid=144180071"><span class="a-size-small a-color-base">戏剧与曲艺</span></a></span></li><li><span class="a-list-item"><a class="a-link-normal s-ref-text-link" href="/s/ref=lp_144180071_nr_n_8?fst=as%3Aoff&amp;rh=n%3A116087071%2Cn%3A%21116088071%2Cn%3A116169071%2Cn%3A144180071%2Cn%3A144200071&amp;bbn=144180071&amp;ie=UTF8&amp;qid=1533176532&amp;rnid=144180071"><span class="a-size-small a-color-base">文学史</span></a></span></li><li><span class="a-list-item"><a class="a-link-normal s-ref-text-link" href="/s/ref=lp_144180071_nr_n_9?fst=as%3Aoff&amp;rh=n%3A116087071%2Cn%3A%21116088071%2Cn%3A116169071%2Cn%3A144180071%2Cn%3A144181071&amp;bbn=144180071&amp;ie=UTF8&amp;qid=1533176532&amp;rnid=144180071"><span class="a-size-small a-color-base">文学理论</span></a></span></li><li><span class="a-list-item"><a class="a-link-normal s-ref-text-link" 
href="/s/ref=lp_144180071_nr_n_10?fst=as%3Aoff&amp;rh=n%3A116087071%2Cn%3A%21116088071%2Cn%3A116169071%2Cn%3A144180071%2Cn%3A144187071&amp;bbn=144180071&amp;ie=UTF8&amp;qid=1533176532&amp;rnid=144180071"><span class="a-size-small a-color-base">文学评论与鉴赏</span></a></span></li><li><span class="a-list-item"><a class="a-link-normal s-ref-text-link" href="/s/ref=lp_144180071_nr_n_11?fst=as%3Aoff&amp;rh=n%3A116087071%2Cn%3A%21116088071%2Cn%3A116169071%2Cn%3A144180071%2Cn%3A144242071&amp;bbn=144180071&amp;ie=UTF8&amp;qid=1533176532&amp;rnid=144180071"><span class="a-size-small a-color-base">期刊杂志</span></a></span></li><li><span class="a-list-item"><a class="a-link-normal s-ref-text-link" href="/s/ref=lp_144180071_nr_n_12?fst=as%3Aoff&amp;rh=n%3A116087071%2Cn%3A%21116088071%2Cn%3A116169071%2Cn%3A144180071%2Cn%3A144243071&amp;bbn=144180071&amp;ie=UTF8&amp;qid=1533176532&amp;rnid=144180071"><span class="a-size-small a-color-base">文学作品导读'
pages_list = []
for each in re.findall('rh=(.*?)&amp', html):   # pull the rh=... fragment out of every category link
    pages_list.append('https://www.amazon.cn/s/rh=' + each)   # rebuild a full search URL per sub-category

count = 0  # used as the txt file name
asin_re = re.compile('data-asin="(.*?)" class')  # regex that pulls the book ASIN out of a listing page
for page_url in pages_list:
    print(page_url)
    html = url_open(page_url)
    doc = pq(html)
    if doc('#pagn > span.pagnDisabled').text():
        page_count = int(doc('#pagn > span.pagnDisabled').text())  # number of pages in this sub-category
    else:
        page_count = 400  # if the pager cannot be parsed, assume 400 pages
    count += 1
    with open(str(count) + '.txt', 'a', encoding='utf-8') as f:  # one txt file per sub-category
        err_count = 0
        for i in range(1, page_count + 1):
            print('Fetching the book ASINs on page %d' % i)
            url = page_url + '&page=' + str(i)
            html = url_open(url)
            print(url)
            if html is not None:
                err_count = 0
                data_asin = re.findall(asin_re, html)
                print(data_asin)
                for each in data_asin:   # write the ASINs to the file
                    f.write(each)
                    f.write('\n')
            else:
                # when the assumed 400-page count overshoots the real last page, the extra pages come back
                # empty; after 20 consecutive failures treat this sub-category as finished and move on
                err_count += 1
                if err_count >= 20:
                    break
# Note: Part 2 reads a single asin.txt, so the per-category files written here still need to be
# combined (a possible merge step is sketched right after this code).
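
Part 1 writes one numbered file per sub-category (1.txt, 2.txt, ...), while Part 2 below reads a single asin.txt. A minimal merge-and-deduplicate step, written here as an assumed bridge between the two scripts (the glob pattern and filenames are assumptions, not part of the original code), could look like this:

import glob

# Merge every ASIN written by Part 1 into the asin.txt file that Part 2 expects,
# dropping duplicates. Assumes the per-category files sit in the working directory
# and are named "1.txt", "2.txt", ...
asins = set()
for filename in sorted(glob.glob('[0-9]*.txt')):
    with open(filename, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                asins.add(line)

with open('asin.txt', 'w', encoding='utf-8') as f:
    for asin in sorted(asins):
        f.write(asin + '\n')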

 

Part 2:

import requests
from fake_useragent import UserAgent
import pymysql
from multiprocessing import Process,Queue,Lock
from pyquery import PyQuery as pq
import time
import random


ua = UserAgent()   # instantiated once; used below to generate a random browser User-Agent per request

# # Helper used for debugging and troubleshooting (kept here commented out)
# def get(url,i=2):
#     headers = {
#         'Accept': 'text/html,*/*',
#         'Accept-Encoding': 'gzip, deflate, br',
#         'Accept-Language': 'zh-CN,zh;q=0.9',
#         'Connection': 'keep-alive',
#     
#         'Host': 'www.amazon.cn',
#         'Referer': 'https://www.amazon.cn/gp/aw/s/ref=is_pn_1?rh=n%3A658390051%2Cn%3A%21658391051%2Cn%3A658394051%2Cn%3A658509051&page=1',
#         'User-Agent': ua.random,
#         'X-Requested-With': 'XMLHttpRequest'
#     }
#     if i>0:
#         try:
#             response = requests.get(url=url, headers=headers,timeout=1)
#             print(response.status_code)
#             response.encoding='utf-8'
#             return response.text
#         except :
#             get(url, i=i - 1)
#     else:return None

# Fetch one proxy from the local proxy-pool service listening on 127.0.0.1:5010
def get_proxy():
    return requests.get("http://127.0.0.1:5010/get/").content.decode().strip()

proxy = get_proxy()   # current proxy, shared by the two url_open functions below

def title_parse(title):  # Amazon titles are long; keep only the part before the first full-width '(' to use as the JD search keyword
    jd_title = []
    for each in title:
        if each != "(":
            jd_title.append(each)
        else:
            break

    jd_title = ''.join(jd_title)
    return jd_title

def price_parse(price):  # tidy the Amazon price: the caller strips the leading '¥', this drops anything from any further '¥' onwards
    amazon_price = []
    for each in price:
        if each != "¥":
            amazon_price.append(each)
        else:
            break

    amazon_price = ''.join(amazon_price)
    return amazon_price

# Request function for Amazon detail pages
def url_open1(url):
    header = {
        'Accept': 'text/html,*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'www.amazon.cn',
        'Referer': 'https://www.amazon.cn/',
        'User-Agent': ua.random,
        'X-Requested-With': 'XMLHttpRequest'
    }
    global proxy
    try:
        if proxy:
            print('Using proxy', proxy)
            proxies = {'http': 'http://' + proxy}
            response = requests.get(url=url, headers=header, proxies=proxies)
        else:
            response = requests.get(url=url, headers=header)
        if response.status_code == 200:
            response.encoding = 'utf-8'
            return response.text
        if response.status_code == 503:
            print('503')
            proxy = get_proxy()   # throttled by Amazon: switch to a fresh proxy and retry
            if proxy:
                return url_open1(url)
            else:
                print('Failed to obtain a proxy')
                return None
    except Exception:
        proxy = get_proxy()       # request error: swap the proxy and retry
        return url_open1(url)

# Request function for JD search pages
def url_open2(url):
    header = {
        'User-Agent': ua.random,
    }
    global proxy
    try:
        if proxy:
            print('Using proxy', proxy)
            proxies = {'http': 'http://' + proxy}
            response = requests.get(url=url, headers=header, proxies=proxies)
        else:
            response = requests.get(url=url, headers=header)
        if response.status_code == 200:
            response.encoding = 'utf-8'
            return response.text
        if response.status_code == 503:
            print('503')
            proxy = get_proxy()
            if proxy:
                return url_open2(url)
            else:
                print('Failed to obtain a proxy')
                return None
    except Exception:
        proxy = get_proxy()
        return url_open2(url)

# The core spider: parses the Amazon detail page and the JD search page, then stores the data
def spider(q, lock):
    # MySQL connection
    conn = pymysql.connect(host='localhost', port=3306, user='root', password='******', db='amazon', charset='utf8')
    cursor = conn.cursor()
    while True:
        lock.acquire()
        try:
            asin = q.get(block=False)[:-1]   # strip the trailing newline
        except Exception:                    # queue is empty: every ASIN has been processed
            lock.release()
            break
        lock.release()
        url = 'https://www.amazon.cn/gp/product/{a}'.format(a=asin)
        print(url)
        html = url_open1(url)
        if html is None:   # the request occasionally returns None; skip the book to avoid crashing
            continue
        doc = pq(html)
        title = doc('#ebooksProductTitle.a-size-extra-large').text()  # book title
        amazon_price = doc('a .a-size-small.a-color-price').text()[1:]  # paperback price (CNY)
        amazon_price = price_parse(amazon_price)
        #e_price = doc('#tmmSwatches > ul > li.swatchElement.selected > span > span:nth-child(4) > span > a').text()[1:-2]  # Kindle price
        amazon_comments = doc('#acrCustomerReviewText.a-size-base').text()[:-5]  # number of reviews
        jd_search_title = title_parse(title)
        url = 'https://search.jd.com/Search?keyword={a}&enc=utf-8'.format(a=jd_search_title)
        html = url_open2(url)
        if html is None:
            continue
        doc = pq(html)
        jd_price = doc('#J_goodsList > ul > li:nth-child(1) > div > div.p-price > strong > i').text()  # JD price
        its = doc('.gl-warp.clearfix li div .p-commit strong a').items()  # the JD review count is a bit awkward to reach
        try:  # guard against the generator being empty
            next(its)                       # the value we want is the second item, so skip the first
            jd_comments = next(its).text()
        except StopIteration:
            jd_comments = None
        print(amazon_comments, amazon_price, title)
        print(jd_price, jd_comments)
        date = time.strftime("%Y-%m-%d", time.localtime())  # scrape date

        # store the row in MySQL (parameterised so quotes in titles cannot break the statement)
        cursor.execute(
            "INSERT INTO data(book_asin,title,amazon_price,amazon_comments,jd_price,jd_comments,update_date) "
            "VALUES (%s,%s,%s,%s,%s,%s,%s);",
            (asin, title, amazon_price, amazon_comments, jd_price, jd_comments, date))
        conn.commit()

        time.sleep(random.random())   # sleep 0–1 second between books
    conn.close()


if __name__ == '__main__':
    q = Queue()    # multiprocessing: the data volume is small, so a Queue is enough for communication
    lock = Lock()
    with open('asin.txt', 'r') as f:
        AsinList = f.readlines()
    for each in AsinList[6000:]:   # Amazon keeps returning 503; adjust this slice to avoid re-scraping ASINs already done
        q.put(each)

    # Several processes are fast at first, but the IP gets blocked within minutes, so only one is started here
    p1 = Process(target=spider, args=(q, lock))
    # p2 = Process(target=spider, args=(q, lock))
    # p3 = Process(target=spider, args=(q, lock))
    # p4 = Process(target=spider, args=(q, lock))
    # p5 = Process(target=spider, args=(q, lock))
    # p6 = Process(target=spider, args=(q, lock))
    # p7 = Process(target=spider, args=(q, lock))
    # p8 = Process(target=spider, args=(q, lock))
    p1.start()
    # p2.start(); p3.start(); p4.start(); p5.start(); p6.start(); p7.start(); p8.start()
    p1.join()
    # p2.join(); p3.join(); p4.join(); p5.join(); p6.join(); p7.join(); p8.join()
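
The INSERT statement in spider() assumes a table named data in the amazon database. The original post does not show its schema; a minimal sketch that matches the column names used above (the types and lengths are assumptions) would be:

import pymysql

# One-off setup script: create the `data` table that spider() writes into.
# Column names come from the INSERT statement; the types and lengths are guesses.
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='******', db='amazon', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS data (
            book_asin       VARCHAR(20),
            title           VARCHAR(255),
            amazon_price    VARCHAR(32),
            amazon_comments VARCHAR(32),
            jd_price        VARCHAR(32),
            jd_comments     VARCHAR(32),
            update_date     DATE
        ) DEFAULT CHARSET = utf8;
    """)
conn.commit()
conn.close()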


 

 
