Amazon Product Scraper

An Amazon product crawler: given a set of search keywords, it collects the image and description of the first 100 products returned for each keyword.

At first I tried the straightforward approach with Python + requests, but the very first product request already failed. The failures came in two flavours: plain network errors, and pages showing "Type the characters you see", i.e. Amazon's CAPTCHA.
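The block is easy to see with plain requests. A minimal sketch (the search URL pattern is the one used in py1.py below; the keyword is just an example):

import requests

# Hedged sketch: what the plain-requests attempt runs into.
# Amazon either errors out at the network level or serves its CAPTCHA page.
resp = requests.get('https://www.amazon.de/s?k=test',
                    headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
if 'Type the characters you see' in resp.text:
    print('blocked: Amazon returned the CAPTCHA page')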

In the end I switched to Selenium driving Chrome; at least over short runs it does not get blocked.
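py1.py drives Chrome through a small ChromeOperate wrapper from selenium_operate.py, which is only published on GitHub (link further down). As a rough sketch of what py1.py needs from it, assuming nothing beyond the standard selenium API, a minimal stand-in could look like this:

from selenium import webdriver

# Hypothetical minimal stand-in for selenium_operate.ChromeOperate,
# inferred only from how py1.py uses it; the real class lives on GitHub.
class ChromeOperate:
    def __init__(self, executable_path='chromedriver.exe'):
        # selenium 3 style; selenium 4 would pass a Service object instead
        self.driver = webdriver.Chrome(executable_path=executable_path)

    def open(self, url):
        self.driver.get(url)

    def open_source(self):
        return self.driver.page_source

    def quit(self):
        self.driver.quit()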

The remaining problem was inserting images into the Excel file: the image has to be sized to the cell, so Excel's column width and row height need to be converted into pixels.
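As a rough rule of thumb (assuming the default font at 96 DPI; exact values vary slightly), one xlsxwriter column-width unit is about 7 px plus a few pixels of padding, and row height is given in points, so pixels ≈ points × 96 / 72:

# Rough conversion between xlsxwriter cell units and pixels.
def col_width_to_px(width):          # column width is in character units
    return int(width * 7 + 5)        # ~7 px per character unit plus padding

def row_height_to_px(height):        # row height is in points
    return int(height * 96 / 72)     # 1 pt = 1/72 inch, rendered at 96 DPI

# gen_xls() below uses set_column('C:C', 20) and set_row(row, 120),
# i.e. a cell of roughly 145 x 160 px, hence the 120 x 160 thumbnails.
print(col_width_to_px(20), row_height_to_px(120))  # -> 145 160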

You need to install the following packages on your machine first: requests, selenium, xlsxwriter, Pillow and lxml (e.g. pip install requests selenium xlsxwriter Pillow lxml), plus a chromedriver.exe that matches your Chrome version.

py1.py is attached below. Note: before running it you must create a keywords.txt file (one search keyword per line) and an img folder (the resized thumbnails are saved as img/tmpN.png).
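Before the first run the working directory should therefore look roughly like this:

py1.py
selenium_operate.py
crawl_tool_for_py3.py
chromedriver.exe        # must match the installed Chrome version
keywords.txt            # one search keyword per line
img/                    # resized thumbnails are written here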

# -*- coding: utf-8 -*-

# German Amazon (amazon.de)

from urllib.parse import quote, unquote
from selenium_operate import ChromeOperate

from crawl_tool_for_py3 import crawlerTool as ct

import re
import time
import requests,urllib
import datetime
import os,base64
import xlsxwriter
from PIL import Image
DOMAIN = 'https://www.amazon.de'

HEADERS = { 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_1_1 like Mac OS X) AppleWebKit/602.2.14 (KHTML, like Gecko) Mobile/14B100 MicroMessenger/6.3.22 NetType/WIFI Language/zh_CN'
            }
se = requests.session()

def img_resize(infile,outfile):
    im = Image.open(infile)
    # (x, y) = im.size  # read image size
    x_s = 120  # target width in pixels
    y_s = 160  # target height in pixels (fits the 120-point row height, roughly 160 px)
    out = im.resize((x_s, y_s), Image.LANCZOS)  # high-quality resize (LANCZOS replaces the removed ANTIALIAS constant in newer Pillow)
    out.save(outfile)


def gen_xls(item_infos):
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    book = xlsxwriter.Workbook('amazon%s.xlsx'%timestamp)
    worksheet = book.add_worksheet('demo')
    worksheet.write_row(0,0, ['关键词','排名','宝贝图片','价格','宝贝类目','宝贝描述','宝贝链接'])
    worksheet.set_column('A:D', 15) # 1 column-width unit ≈ 8 px, 1 row-height point ≈ 1.37 px
    worksheet.set_column('C:C', 20)
    worksheet.set_column('B:B', 10)
    worksheet.set_column('F:F', 50)
    for i in range(len(item_infos)):
        col = i+1  # row index in the worksheet (row 0 holds the header)
        try:
            item_info = item_infos[i]
            row =   [item_info['keyword'],item_info['rank'],'',item_info['price'],item_info['cat'],item_info['descriptions'],item_info['item_url']]
            worksheet.write_row(col,0, row)
            worksheet.set_row(col, 120)
            if 'item_pic_base64' in item_info:
                item_pic_base64 = item_info["item_pic_base64"]
                try:
                    if 'https:' in item_pic_base64:
                        data = ct.get(item_pic_base64)            # the img src was a normal URL: download it
                    else:
                        data = base64.b64decode(item_pic_base64)  # the img src was an inline base64 data URI
                    with open('test.png', 'wb') as f:
                        f.write(data)
                    img_resize('test.png', 'img/tmp%s.png'%i)
                    worksheet.insert_image( col,2, 'img/tmp%s.png'%i) # each image needs a distinct file name
                except Exception as e:
                    print(str(e))
        except Exception as e:
            print(str(e))
    print('Finished, %s result rows' % len(item_infos))
    book.close()


def extractor_page(page): # parse a product detail page
    item_info = {"descriptions":""}
    descriptions = ct.getXpath('//div[@id="productDescription"]/p/text()',page)
    if not descriptions:
        descriptions = ct.getXpath( '//div[@id="aplus"]/div//p//text()', page)
    descriptions= ''.join([description.strip() for description in descriptions])
    item_info["descriptions"] = descriptions
    item_pic_base64 = ct.getXpath1( '//div[@id="imgTagWrapperId"]/img/@src', page).split('base64,')[-1]
    item_info["item_pic_base64"] = item_pic_base64
    price = ct.getXpath1( '//span[@id="priceblock_ourprice"]/text()', page)
    item_info["price"] = price
    cats =  ct.getXpath( '//div[@id="wayfinding-breadcrumbs_container"]//a/text()', page)
    item_info["cat"] = '/'.join([cat.strip() for cat in cats])
    for k in item_info:
        print(k)  # debug: list the extracted field names
    return item_info

if __name__ == '__main__':
 
    csv_rows=[]
    cookie = {}
    item_infos = []
    cop = ChromeOperate(executable_path=r'chromedriver.exe')
    cop.open(DOMAIN)
    with open('keywords.txt','r',encoding='utf-8') as keyword_file:  # assumed UTF-8, one keyword per line
        for line in keyword_file:
            line = line.strip()
            if not line:
                continue
            urls = [DOMAIN+'/s?k=%s&ref=nb_sb_noss_2'%quote(line),
                    # 'https://www.amazon.com/s?k=%s&ref=nb_sb_noss_2&page=2 ' % quote(line)
                    ]
            rank = 0
            for url in urls:
                # HEADERS.update({"Referer":url,"User-Agent":random.choice(USER_AGENT_POOL)})
                cop.open(url)
                page = cop.open_source()
                item_urls = ct.getXpath('//div[@class="sg-row"]//div[@class="sg-col-inner"]//h2/a/@href',page)
                if not item_urls:
                    print(page)  # nothing matched: dump the page source for debugging
                for item_url in item_urls:
                    rank += 1
                    try:
                        if 'qid' not in item_url:  # keep only result links that carry a qid parameter
                            continue
                        else:
                            item_url = DOMAIN+item_url
                            cop.open(item_url)
                            page = cop.driver.page_source
                            if 'Kindle Edition' in page:
                                continue
                            if 'Type the characters you see' in page:  # CAPTCHA page: back off instead of parsing it
                                print('IP blocked:', url)
                                time.sleep(10)
                                break
                            item_info = extractor_page(page)
                            item_info['keyword'] = line
                            item_info['rank'] = rank
                            item_info['item_url'] = item_url.split('?')[0]
                            item_infos.append(item_info)
                    except Exception as e:
                        print(str(e))
    gen_xls(item_infos)
    cop.quit()

Sample of the actual scraped data:

2020-05-12: the code was ported to Python 3. The two helper classes it uses (selenium_operate and crawl_tool_for_py3) are available in another project on GitHub, but they will not be kept up to date.

Project repository: https://github.com/MemoryAndDream/weibo_craw/tree/master/weibo

crawl_tool_for_py3.py

# -*- coding: utf-8 -*-
"""
File Name:     crawl_tool_for_py3
Description :
Author :       meng_zhihao
date:          2018/11/20
"""

import requests
from lxml import etree
import re

HEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}

# general-purpose crawler helpers
class crawlerTool:

    def __init__(self):
        self.session = requests.session()
        self.proxy_now = ''
        pass

    def __del__(self):
        self.session.close()

    @staticmethod
    def get(url, proxies=None, cookies={}, referer='',headers={}):
        if not headers:
            if referer:
                headers = {'Referer': referer}
                headers.update(HEADERS)
            else:
                headers = HEADERS
        if proxies:
            rsp = requests.get(url, timeout=10, headers=headers, cookies=cookies,proxies=proxies)
        else:
            rsp = requests.get(url, timeout=30, headers=headers, cookies=cookies)
        return rsp.content  # returns raw bytes

    @staticmethod
    def post(url,data,headers={}): # data must be a dict; Chinese values need no manual encoding!
        if headers:
            rsp = requests.post(url, data, timeout=10,headers=headers)
        else:
            rsp = requests.post(url,data,timeout=10)
        return rsp.content


    def sget(self,url,cookies={},headers = {}):
        headers = dict(headers)
        headers.update(HEADERS)  # merge defaults; dict.update() returns None, so its result must not be assigned back
        if cookies:
            rsp = self.session.get(url,timeout=10,headers=headers,cookies=cookies)
        else:
            rsp = self.session.get(url, timeout=10, headers=headers)
        return rsp.content # returns raw bytes

    def spost(self,url,data,allow_redirects=True,headers = {}):
        headers = dict(headers)
        headers.update(HEADERS)  # merge defaults; dict.update() returns None, so its result must not be assigned back

        rsp = self.session.post(url,data,timeout=10,headers=headers,allow_redirects=allow_redirects)
        return rsp.content



    # run an XPath query and return all matches; input type should be checked or wrapped in try/except
    @staticmethod
    def getXpath(xpath, content):   # note: xpath seems to turn Chinese into &#xxxx; entities; /text() results come back as unicode
        tree = etree.HTML(content)
        out = []
        results = tree.xpath(xpath)
        for result in results:
            if  'ElementStringResult' in str(type(result)) or 'ElementUnicodeResult' in str(type(result)) :
                out.append(result)
            else:
                out.append(etree.tostring(result))
        return out

    # same as getXpath but returns only the first match ('' if nothing matched)
    @staticmethod
    def getXpath1(xpath, content):   # in Python 3 the content passed in should be a str!!
        tree = etree.HTML(content)
        out = []
        results = tree.xpath(xpath)
        for result in results:
            if  'ElementStringResult' in str(type(result)) or 'ElementUnicodeResult' in str(type(result)) :
                out.append(result)
            else:
                out.append(etree.tostring(result))
        if out:
            return out[0]
        return ''

    @staticmethod
    def getRegex(regex, content):
        if type(content) == type(b''):
            content = content.decode('utf8')
        rs = re.search(regex,content)
        if rs:
            return rs.group(1)
        else:
            return ''

    @staticmethod
    def parser_urlpars(pars):
        # pars= 'q=GENER8%20MARITIME%2C%20INC.&quotesCount=6&newsCount=4&enableFuzzyQuery=false&quotesQueryId=tss_match_phrase_query&multiQuoteQueryId=multi_quote_single_token_query&newsQueryId=news_cie_vespa&enableCb=true&enableNavLinks=true&vespaNewsTimeoutMs=600'
        from urllib.parse import quote, unquote
        pars = pars.replace('+', ' ')
        field_pars = pars.split('&')
        par_dict = {}
        for field_par in field_pars:
            field = field_par.split('=')
            if len(field) == 1:
                par_dict[unquote(field[0])] = None
            else:
                par_dict[unquote(field[0])] = unquote(field[1])
        return par_dict

    @staticmethod
    def parser_cookie(cookie_str):
        # cookie_str = '_vcooline_ikcrm_production_ikcrm.com_session_=bHpKZVZSUzNoZ3MraFVBK1BrV1MvbjF1Mzdtdk9lZ0pxWm0zY1hUVDRnVXhBWTNmek9pQ2xNelJFRE1obVRoaVg4RkJldU5JR0xrKzNUSE5RUWxTWU5DSlVRVjBtTkhrMmJWMzRDTjJ3MXNhQjVwSjlsZkRsRjJYR0FUbERhdWtNdEhrM1V5emdBd1VzN0FIR0ppOTNYZjNGSHJ0cjVCcmJBVXhPdVFCdWkyeTF6b0RiRFA3ZVV0VWRNelQ5NHIweWsxSkxRSkRWbTUxaHBKaGFnc1NlSEl1MjZnQUFBdDhXUm8xV3pYWmcwVTVCbnpkT0tpNlNOZlVUdEUzUUo4YlhVelBBMkhyL3QzR0hHenpiM0hnVzZickMzTDBvMTBvNkFhenNJOWZxTGMyQk9VUnl3aVV2ellzZ05HeGZ2L1EtLUhwOWNLZEwxWm5xN0M5TjhxL2JpQVE9PQ%3D%3D--7085cd9bc2c7ff4f2e1f60bf29a838f21ef41ede; domain=.ikcrm.com; path=/'
        cookie_dict = {}
        for cookie in cookie_str.split(';'):
            l = cookie.split('=')
            key, value = l[0],'='.join(l[1:])
            cookie_dict[key] = value
        return cookie_dict

    @staticmethod
    def get_text(page_source):
        if type(page_source) == type(b''):
            page_source = page_source.decode('utf8')
        return re.sub('(<.*?>)',"",page_source)

    @staticmethod
    def quote(input):
        from urllib import parse
        return parse.quote(input)

    @staticmethod
    def unquote(input):
        from urllib import parse
        return parse.unquote(input)

    def thread_get(self):

        pass

    @staticmethod
    def get_new_5min_proxy():
        '''curl 'http://t.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=0&fa=0&fetch_key=&qty=1&time=101&pro=&city=&port=1&format=txt&ss=1&css=&dt=1&specialTxt=3&specialJson=' -H 'Connection: keep-alive' -H 'Pragma: no-cache' -H 'Cache-Control: no-cache' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3' -H 'Referer: http://www.zhilianhttp.com/getapi.html' -H 'Accept-Encoding: gzip, deflate' -H 'Accept-Language: zh-CN,zh;q=0.9' -H 'Cookie: PHPSESSID=62eeapps46ucvcbg4qbqir15u6; SERVERID=fe3e577c4710ab81a140ce46bb723b00|1566458695|1566456808' --compressed'''
        # the target site has anti-crawling protection; buy your own proxy account at http://www.zhilianhttp.com/Shop/index.html?on=exclusive, top up proxy credits, then extract 5-minute proxies. The account used here is mine and the IP whitelist only holds my own IP, so replace it, or simply increase the delays! http://www.zhilianhttp.com
        import requests
        url = 'http://t.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=0&fa=0&fetch_key=&qty=1&time=101&pro=&city=&port=1&format=txt&ss=1&css=&dt=1&specialTxt=3&specialJson='
        HEADERS = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
        cookies = {"PHPSESSID": "0pio7gtq035godm163mnplllo5",
                   "SERVERID": "5e4e1bec6f8aa1c7027d0867db140981|1566534695|1566534695"}  # 这里替换成自己的账号cookie!
        proxy = requests.get(url, timeout=10, headers=HEADERS, cookies=cookies).text
        return proxy.strip()


    @staticmethod
    def get_new_1min_proxy():
        '''curl 'http://t.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=0&fa=0&fetch_key=&qty=1&time=101&pro=&city=&port=1&format=txt&ss=1&css=&dt=1&specialTxt=3&specialJson=' -H 'Connection: keep-alive' -H 'Pragma: no-cache' -H 'Cache-Control: no-cache' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3' -H 'Referer: http://www.zhilianhttp.com/getapi.html' -H 'Accept-Encoding: gzip, deflate' -H 'Accept-Language: zh-CN,zh;q=0.9' -H 'Cookie: PHPSESSID=62eeapps46ucvcbg4qbqir15u6; SERVERID=fe3e577c4710ab81a140ce46bb723b00|1566458695|1566456808' --compressed'''
        # the target site has anti-crawling protection; buy your own proxy account at http://www.zhilianhttp.com/Shop/index.html?on=exclusive, top up proxy credits, then extract 5-minute proxies. The account used here is mine and the IP whitelist only holds my own IP, so replace it, or simply increase the delays! http://www.zhilianhttp.com
        import requests
        url = 'http://ip.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=0&fa=0&fetch_key=&qty=1&time=100&pro=&city=&port=1&format=txt&ss=1&css=&dt=1&specialTxt=3&specialJson=&usertype=2'
        HEADERS = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
        cookies = {"PHPSESSID": "0pio7gtq035godm163mnplllo5",
                   "SERVERID": "5e4e1bec6f8aa1c7027d0867db140981|1566534695|1566534695"}  # 这里替换成自己的账号cookie!
        proxy = requests.get(url, timeout=10, headers=HEADERS, cookies=cookies).text
        return proxy.strip()


    def get_by_proxy(self,url,success_symbol = '',failed_symbol_symbol='',cookies={}, referer=''):
        '''
        Fetch a URL through a short-lived proxy, retrying once with a fresh proxy on failure.
        :param success_symbol:  substring that marks a successful page
        :param failed_symbol_symbol:  substring that marks a failed/blocked page
        :return: page text ('' if every attempt failed)
        '''
        page_buf = ''
        for retry_times in range(2):
            if not self.proxy_now:
                self.proxy_now = self.get_new_1min_proxy()
            page_buf = self.get(url,referer=referer,cookies=cookies,proxies = {'http':'http://'+self.proxy_now ,'https':'http://'+self.proxy_now }).decode('utf8')
            if success_symbol and success_symbol in page_buf:
                return page_buf
            if failed_symbol_symbol and failed_symbol_symbol in page_buf:
                self.proxy_now = ''  # drop the bad proxy so the retry fetches a fresh one
                continue
            return page_buf
        return page_buf

    def get_regular_file_name(self,file_name):
        file_name = file_name.replace('\t','').replace('\n','')
        file_name = re.sub(u"([/\\\\:*?<>|])", "", file_name)
        return file_name


    @staticmethod
    def writer_to_csv(datas,file_path):
        import csv
        with open(file_path,'a',encoding='utf_8_sig',newline='',) as fout:
            writer = csv.writer(fout)
            for data in datas:
                writer.writerow(data)
                fout.flush()








if __name__ == '__main__':
    # content = (crawlerTool.get('http://docs.python-requests.org/zh_CN/latest/user/quickstart.html'))
    # content = (content.decode('utf8'))
    # print(crawlerTool.getXpath('//h2/text()',content))
    # ctobj = crawlerTool()
    # # ctobj.sget('https://www.ic.net.cn/')
    # #
    # p = crawlerTool.get('https://www.amazon.com/gp/search/other/ref=sr_in_A_B?rh=i%3Afashion-mens-clothing%2Cn%3A7141123011%2Cn%3A%217141124011%2Cn%3A7147441011%2Cn%3A1040658%2Cn%3A2476517011%2Ck%3At-shirt&keywords=t-shirt&pickerToList=enc-merchantbin&indexField=A&ie=UTF8&qid=1572916359')
    # print(p)
    # with open('1.html','w') as f:
    #     f.write(p.decode('utf8'))
    # print(ctobj.get_new_5min_proxy())
    # p = crawlerTool.get('http://www.google.com/',proxies={'http':'socks5://'+'127.0.0.1:1080','https':'socks5://'+'127.0.0.1:1080'}) # pip install pysocks
    # print(p)
#     # import binascii
#     # with open('28099.html',encoding='utf8') as f:
#     #     page = f.read()
#     #     contents = crawlerTool.getXpath('//span[@class="stonefont"]/text()',page)
#     #     for content in contents:
#     #         print(content)
#     #         byte_data = content.encode('utf8')  # 7 24494 74.09  '\xef\x8b\x81' 7 \xef\x8f\xa5 4  xee\x8a\x80 2
#     #         print(byte_data)
#     #         print(binascii.b2a_hex(byte_data)
# )
    print(crawlerTool().get_new_1min_proxy())

 

 


————————————————
Copyright notice: this is an original article by CSDN blogger "Memory_qq312141830", released under the CC 4.0 BY-SA license; please include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/Memory_and_Dream/article/details/90040018
