Analysis:
1. Use Chrome's developer tools (F12) to locate the site's AJAX endpoint.
2. Send a POST request to that endpoint to receive the data as JSON (a minimal sketch of the request follows this list).
3. Build a proxy pool and crawl through it.
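For step 2, here is a minimal sketch of that POST request on its own, without any proxy. The endpoint, form fields, and headers are the ones captured in DevTools and reused in the full program below; the particular field values (华北, 吉林, 大连) are just one example query.

import requests

url = 'http://index.0256.cn/expcenter_trend.action'
# Form fields as captured in Chrome DevTools: this combination asks for
# the freight index of the 吉林 -> 大连 line in the 华北 region.
data = {'marketId': 1, 'attribute1': 1, 'exponentTypeId': 2, 'cateId': 2,
        'attribute2': '华北', 'city': '', 'startLine': '吉林', 'endLine': '大连'}
headers = {'X-Requested-With': 'XMLHttpRequest',
           'Referer': 'http://index.0256.cn/expx.htm'}
response = requests.post(url, data=data, headers=headers, timeout=5)
print(response.json())  # the endpoint answers with JSON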
1. The proxy pool:
The pool builder walks the pages of a free-proxy site and tests every proxy it finds against the target page; each proxy that succeeds is appended to a text file on the D drive.
Concretely, it browses the kuaidaili listing pages and checks each proxy for availability. Once roughly five working proxies have been collected, the crawler can be started; from then on the crawler and the proxy harvester run side by side, so fresh working proxies keep coming in.
Limitation: the proxies in the pool are not guaranteed to stay usable. Some work at test time but die later, and some that fail the initial test come alive afterwards.
import requests
from bs4 import BeautifulSoup
import time
import threading  # needed by run() below
class get_proxies(object):
def __init__(self):
#proxy_list=[]
self.headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'channelid=0; sid=1574476990415840; _ga=GA1.2.1841802931.1574478582; _gid=GA1.2.556817404.1574478582; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1573801017,1573805424,1574478582; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1574478732',
            'Host': 'www.kuaidaili.com',
'Referer': 'https://www.kuaidaili.com/free/inha/6/',
            'Upgrade-Insecure-Requests': '1',  # header values must be strings
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
self.test_headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '105',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'clientlanguage=zh_CN; JSESSIONID=8348A6C6CADA87FF9ECF0A87DAB96AC4',
'Host': 'index.0256.cn',
'Origin': 'http://index.0256.cn',
'Referer': 'http://index.0256.cn/expx.htm',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest'}
        self.ip_score=IP_SCORE()  # IP_SCORE is the scoring class defined in section 2 below
def get_page(self,url):
try:
response=requests.get(url,headers=self.headers)
            response.raise_for_status()  # raise if the response status is not OK
return response.text
        except requests.RequestException:
            print('Failed to fetch proxy page ' + url)
            return ''
def kuaidaili(self,startpage,endpage):
start_url='https://www.kuaidaili.com/free/inha/{page}/'
for page in range(startpage,endpage):
url=start_url.format(page=page)
time.sleep(1)
            print('Fetching proxy page ' + url)
response=self.get_page(url)
if response:
soup=BeautifulSoup(response,'lxml')
trs=soup.tbody.find_all(name='tr')
for tr in trs:
ip=tr.find(attrs={'data-title':'IP'}).get_text()
port=tr.find(attrs={'data-title':'PORT'}).get_text()
proxy=ip+':'+port
yield proxy
    def to_txt(self,proxy):
        # append a working proxy to the pool file on disk
with open('D:/ip代理.txt','a',encoding='utf-8') as f:
f.write(proxy+'\n')
    def test(self,proxy):
        # verify the proxy by sending the real POST request through it
test_url='http://index.0256.cn/expcenter_trend.action'
proxy_host="http://" + proxy
proxy2={'http':proxy_host}
data={'marketId':1,'attribute1':1,'exponentTypeId':2,'cateId':2, 'attribute2':'华北','city':'','startLine':'吉林','endLine':'大连'}
try:
response=requests.post(test_url,data=data,headers=self.test_headers,proxies=proxy2,timeout=2)
response.raise_for_status()
            print('Got a working proxy: ' + proxy_host)
self.ip_score.score(proxy,wl=1)
self.to_txt(proxy)
        except Exception:
            pass  # the proxy failed the check; just drop it
def main(self,startpage,endpage):
for proxy in self.kuaidaili(startpage,endpage):
self.test(proxy)
def run(self):
task1=threading.Thread(target=self.main,args=(1,300,))
task2=threading.Thread(target=self.main,args=(300,600,))
task3=threading.Thread(target=self.main,args=(600,900,))
task1.start()
time.sleep(3)
task2.start()
time.sleep(3)
task3.start()
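The class has no entry point of its own. A minimal way to start the pool builder (assuming the IP_SCORE class from section 2 below is defined in the same module, since __init__ instantiates it):

if __name__ == '__main__':
    pool = get_proxies()
    pool.run()  # three threads walk pages 1-900 and append working proxies to D:/ip代理.txt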
Each working proxy ends up in D:/ip代理.txt as a single ip:port line.
2. A scoring system for the proxy pool:
The get_all_ip method first reads all proxies saved on disk and gives each one an initial score of 20.
The score method adjusts a proxy's score: +10 every time the proxy fetches the page successfully, -1 every time it fails.
The get_ip method sorts the proxies by score and randomly picks one of the top five for the next request.
The program:
import random
class IP_SCORE(object):
def __init__(self):
self.ip_score={}
def score(self,ip,wl=0):
        '''
        Score a proxy:
        wl == 0: new proxy, initial score 20
        wl == 1: successful request, +10
        anything else: failed request, -1
        '''
if wl==0:
self.ip_score[ip]=self.ip_score.get(ip,20)
elif wl==1:
self.ip_score[ip]=self.ip_score.get(ip,20)+10
else:
self.ip_score[ip]=self.ip_score.get(ip,20)-1
def get_ip(self):
        'Sort the proxies by score and randomly pick one of the five highest-scoring ones'
ip=sorted(self.ip_score.items(), key=lambda e:e[1], reverse=True)
return random.choice(ip[:5])[0]
def get_all_ip(self):
with open('D:/ip代理.txt','r',encoding='utf-8') as f:
ips=f.readlines()
for ip in ips:
self.score(ip.strip('\n'),0)
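A quick sketch of how the scores evolve (the proxy address here is made up for illustration):

s = IP_SCORE()
s.score('1.2.3.4:8080', 0)   # new proxy: score starts at 20
s.score('1.2.3.4:8080', 1)   # one success: 20 + 10 = 30
s.score('1.2.3.4:8080', -1)  # one failure: 30 - 1 = 29
print(s.ip_score['1.2.3.4:8080'])  # 29
print(s.get_ip())                  # picks randomly among the highest-scoring proxies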
3. The crawler for the China road logistics freight-rate index:
The program first reads the saved request URLs from a local file. process_url percent-encodes the Chinese characters in each URL, and get_response builds and sends the request, calling the IP_SCORE class described above to pick a high-scoring proxy. parse_response then unpacks the returned JSON, and finally to_csv appends each record to a CSV file.
import json
import re
import time
import csv
import requests
import random
import urllib.parse
from bs4 import BeautifulSoup
import threading
class get_proxies(object):
def __init__(self):
#proxy_list=[]
self.headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'channelid=0; sid=1574476990415840; _ga=GA1.2.1841802931.1574478582; _gid=GA1.2.556817404.1574478582; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1573801017,1573805424,1574478582; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1574478732',
            'Host': 'www.kuaidaili.com',
'Referer': 'https://www.kuaidaili.com/free/inha/6/',
            'Upgrade-Insecure-Requests': '1',  # header values must be strings
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
self.test_headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '105',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'clientlanguage=zh_CN; JSESSIONID=8348A6C6CADA87FF9ECF0A87DAB96AC4',
'Host': 'index.0256.cn',
'Origin': 'http://index.0256.cn',
'Referer': 'http://index.0256.cn/expx.htm',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest'}
self.ip_score=IP_SCORE()
def get_page(self,url):
try:
response=requests.get(url,headers=self.headers)
            response.raise_for_status()  # raise if the response status is not OK
return response.text
        except requests.RequestException:
            print('Failed to fetch proxy page ' + url)
            return ''
def kuaidaili(self,startpage,endpage):
start_url='https://www.kuaidaili.com/free/inha/{page}/'
for page in range(startpage,endpage):
url=start_url.format(page=page)
time.sleep(1)
            print('Fetching proxy page ' + url)
response=self.get_page(url)
if response:
soup=BeautifulSoup(response,'lxml')
trs=soup.tbody.find_all(name='tr')
for tr in trs:
ip=tr.find(attrs={'data-title':'IP'}).get_text()
port=tr.find(attrs={'data-title':'PORT'}).get_text()
proxy=ip+':'+port
yield proxy
def to_txt(self,proxy):
with open('D:/ip代理.txt','a',encoding='utf-8') as f:
f.write(proxy+'\n')
def test(self,proxy):
test_url='http://index.0256.cn/expcenter_trend.action'
proxy_host="http://" + proxy
proxy2={'http':proxy_host}
data={'marketId':1,'attribute1':1,'exponentTypeId':2,'cateId':2, 'attribute2':'华北','city':'','startLine':'吉林','endLine':'大连'}
try:
response=requests.post(test_url,data=data,headers=self.test_headers,proxies=proxy2,timeout=2)
response.raise_for_status()
            print('Got a working proxy: ' + proxy_host)
self.ip_score.score(proxy,wl=1)
self.to_txt(proxy)
        except Exception:
            pass  # the proxy failed the check; just drop it
def main(self,startpage,endpage):
for proxy in self.kuaidaili(startpage,endpage):
self.test(proxy)
def run(self):
task1=threading.Thread(target=self.main,args=(1,300,))
task2=threading.Thread(target=self.main,args=(300,600,))
task3=threading.Thread(target=self.main,args=(600,900,))
task4=threading.Thread(target=self.main,args=(1000,1300,))
        task5=threading.Thread(target=self.main,args=(1600,1900,))  # must be defined, since task5.start() is called below
task1.start()
time.sleep(3)
task2.start()
time.sleep(3)
task3.start()
time.sleep(3)
task4.start()
time.sleep(3)
task5.start()
class IP_SCORE(object):
def __init__(self):
self.ip_score={}
def score(self,ip,wl=0):
        '''
        Score a proxy:
        wl == 0: new proxy, initial score 20
        wl == 1: successful request, +10
        anything else: failed request, -1
        '''
if wl==0:
self.ip_score[ip]=self.ip_score.get(ip,20)
elif wl==1:
self.ip_score[ip]=self.ip_score.get(ip,20)+10
else:
self.ip_score[ip]=self.ip_score.get(ip,20)-1
def get_ip(self):
        ip=sorted(self.ip_score.items(), key=lambda e:e[1], reverse=True)
        return random.choice(ip[:3])[0]  # pick randomly among the three highest-scoring proxies
def get_all_ip(self):
with open('D:/ip代理.txt','r',encoding='utf-8') as f:
ips=f.readlines()
for ip in ips:
self.score(ip.strip('\n'),0)
class heyuling(object):
def __init__(self,url_filename,data_filename):
#self.data=data
self.url_filename=url_filename
self.data_filename=data_filename
self.url='http://index.0256.cn/expcenter_trend.action'
self.headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
            'Content-Length': '105',  # copied from the captured request; requests would normally compute this itself
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'clientlanguage=zh_CN; JSESSIONID=8348A6C6CADA87FF9ECF0A87DAB96AC4',
'Host': 'index.0256.cn',
'Origin': 'http://index.0256.cn',
'Referer': 'http://index.0256.cn/expx.htm',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest'}
self.ip_score=IP_SCORE()
self.get_proxies=get_proxies()
def get_response(self,url,data,headers,code):
#self.ip_score.get_all_ip()
        ip=self.ip_score.get_ip()  # pick a high-scoring proxy
try:
proxy={'http':'http://'+ip}
response=requests.request("POST", url, data=data, headers=headers,timeout=2,proxies=proxy)
response.raise_for_status()
d=self.parse_response(response,code,self.url,data,self.headers)
self.ip_score.score(ip,1)
            print('Fetched successfully: {data}'.format(data=data))
return d
        except Exception:
            print('Request failed: {data}'.format(data=data) + '... retrying')
            self.ip_score.score(ip,-1)
            return self.get_response(url,data,headers,code)  # retry recursively with another proxy
def get_proxy(self):
        # start harvesting proxies in the background
self.get_proxies.run()
def parse_response(self,response,code,url,data,headers):
inf=response.text
inf_json=json.loads(inf)
data={}
        date=inf_json['chart1']['xLebal'][-1]  # latest date; 'xLebal'/'yLebal' are the key names as they appear in the site's JSON
        timearray=time.strptime(date,'%Y-%m-%d')
        data['date']=time.strftime('%Y%m%d',timearray)  # reformat YYYY-MM-DD as YYYYMMDD
        data['value']=inf_json['chart1']['yLebal'][-1]  # latest index value
data['code']=code
return data
def to_csv(self,filename,data):
with open(filename,'a',encoding='utf-8',newline='') as f:
writer=csv.writer(f)
writer.writerow([data['code'],data['date'],data['value']])
    def get_urls(self,filename):
        # each line of the URL file is: <url><TAB><code>
with open(filename,'r',encoding='gbk') as f:
urls=f.readlines()
for u in urls:
url=u.split('\t')[0]
code=u.split('\t')[1].strip()
yield [url,code]
def process_url(self,url):
        '''
        Percent-encode the Chinese characters in a URL read from the text file
        '''
        repl=lambda matched: urllib.parse.quote(matched.group(0))
        new_url=re.sub('[\u4e00-\u9fa5]+',repl,url)  # encode every run of CJK characters
return new_url
def main(self):
infs=self.get_urls(self.url_filename)
self.ip_score.get_all_ip()
num=0
for inf in infs:
url=inf[0]
code=inf[1]
num+=1
data=self.process_url(url)
d=self.get_response(self.url,data,self.headers,code)
            print('Saving record {n}'.format(n=num))
self.to_csv(self.data_filename,d)
def run(self):
SpiderThread=threading.Thread(target=self.main,args=())
IPThread=threading.Thread(target=self.get_proxy,args=())
SpiderThread.start()
IPThread.start()
SpiderThread.join()
IPThread.join()
if __name__=='__main__':
    #urlfile=input('Path of the file that holds the URLs: ')
    #datafile=input('Path for the output file: ')
url_file='D:/中国公路物流运价指数/URL月.txt'
data_file='D:/中国公路物流运价指数/202001011.csv'
hyl=heyuling(url_file,data_file)
hyl.run()
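For reference, a quick check of what process_url does to the Chinese parameters in a saved request body (the string below is a made-up example in the same shape as the lines in URL月.txt, and the file names passed to heyuling are placeholders):

h = heyuling('urls.txt', 'out.csv')
raw = 'marketId=1&attribute2=华北&startLine=吉林&endLine=大连'
print(h.process_url(raw))
# marketId=1&attribute2=%E5%8D%8E%E5%8C%97&startLine=%E5%90%89%E6%9E%97&endLine=%E5%A4%A7%E8%BF%9E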