Scraping China Road Logistics Data with a Proxy Pool

Analysis:
1. Use Chrome DevTools (F12) to locate the site's AJAX endpoint
2. Send POST requests to that endpoint to get JSON data (see the sketch after this list)
3. Build a proxy pool to drive the crawl
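
As a quick sanity check of steps 1 and 2, the AJAX endpoint found in the Network tab can be POSTed to directly, before any proxies are involved. A minimal sketch, using the endpoint URL and form fields that appear in the full program below:

import requests

url='http://index.0256.cn/expcenter_trend.action'
data={'marketId':1,'attribute1':1,'exponentTypeId':2,'cateId':2,
      'attribute2':'华北','city':'','startLine':'吉林','endLine':'大连'}
response=requests.post(url,data=data,timeout=5)
print(response.json()['chart1']['xLebal'][-1])  # latest date on the chart ('xLebal' is the site's own key)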

1. The proxy pool:

The harvester browses the proxy listing pages and tests each proxy to see whether it can successfully reach the target site; proxies that pass are saved to a text file on the D drive.

It walks the kuaidaili free-proxy pages, iterates over every proxy listed, and checks whether it still works. Once about five working proxies have been collected, the scraper can be started; from then on the scraper and the proxy harvester run side by side, so valid proxies keep flowing in.

Limitation: the proxies in the pool are not guaranteed to stay usable; some work at test time and die later, while others fail at first and start working afterwards. The scoring system in section 2 is there to absorb this churn.

import requests
import threading
import time
from bs4 import BeautifulSoup

class get_proxies(object):
    def __init__(self):
        # headers for the kuaidaili listing pages, copied from Chrome DevTools
        self.headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                 'Accept-Encoding': 'gzip, deflate, br',
                 'Accept-Language': 'zh-CN,zh;q=0.9',
                 'Cache-Control': 'max-age=0',
                 'Connection': 'keep-alive',
                 'Cookie': 'channelid=0; sid=1574476990415840; _ga=GA1.2.1841802931.1574478582; _gid=GA1.2.556817404.1574478582; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1573801017,1573805424,1574478582; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1574478732',
                 'Host': 'www.kuaidaili.com',
                 'Referer': 'https://www.kuaidaili.com/free/inha/6/',
                 'Upgrade-Insecure-Requests': '1',  # header values must be strings
                 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
        # headers for the test POST against the logistics-index endpoint;
        # no hard-coded Content-Length: requests computes it from the body
        self.test_headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
                      'Accept-Encoding': 'gzip, deflate',
                      'Accept-Language': 'zh-CN,zh;q=0.9',
                      'Connection': 'keep-alive',
                      'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                      'Cookie': 'clientlanguage=zh_CN; JSESSIONID=8348A6C6CADA87FF9ECF0A87DAB96AC4',
                      'Host': 'index.0256.cn',
                      'Origin': 'http://index.0256.cn',
                      'Referer': 'http://index.0256.cn/expx.htm',
                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
                      'X-Requested-With': 'XMLHttpRequest'}

        self.ip_score=IP_SCORE()  # scoring system, defined in section 2

    def get_page(self,url):
        try:
            response=requests.get(url,headers=self.headers)
            response.raise_for_status()  # raise if the status code is not 2xx
            return response.text
        except requests.RequestException:
            print('Failed to fetch proxy page '+url)
            return ''

    def kuaidaili(self,startpage,endpage):
        '''Walk the kuaidaili free-proxy listing and yield ip:port strings.'''
        start_url='https://www.kuaidaili.com/free/inha/{page}/'
        for page in range(startpage,endpage):
            url=start_url.format(page=page)
            time.sleep(1)
            print('Fetching proxy page '+url)
            response=self.get_page(url)
            if response:
                soup=BeautifulSoup(response,'lxml')
                trs=soup.tbody.find_all(name='tr')
                for tr in trs:
                    ip=tr.find(attrs={'data-title':'IP'}).get_text()
                    port=tr.find(attrs={'data-title':'PORT'}).get_text()
                    yield ip+':'+port

    def to_txt(self,proxy):
        '''Append a working proxy to the text file on the D drive.'''
        with open('D:/ip代理.txt','a',encoding='utf-8') as f:
            f.write(proxy+'\n')

    def test(self,proxy):
        '''POST to the target endpoint through the proxy; keep the proxy if it works.'''
        test_url='http://index.0256.cn/expcenter_trend.action'
        proxy_host='http://'+proxy
        proxies={'http':proxy_host}
        data={'marketId':1,'attribute1':1,'exponentTypeId':2,'cateId':2, 'attribute2':'华北','city':'','startLine':'吉林','endLine':'大连'}
        try:
            response=requests.post(test_url,data=data,headers=self.test_headers,proxies=proxies,timeout=2)
            response.raise_for_status()
            print('Got a working proxy: '+proxy_host)
            self.ip_score.score(proxy,wl=1)
            self.to_txt(proxy)
        except Exception:
            pass

    def main(self,startpage,endpage):
        for proxy in self.kuaidaili(startpage,endpage):
            self.test(proxy)

    def run(self):
        # three threads harvest different page ranges in parallel
        task1=threading.Thread(target=self.main,args=(1,300))
        task2=threading.Thread(target=self.main,args=(300,600))
        task3=threading.Thread(target=self.main,args=(600,900))
        task1.start()
        time.sleep(3)
        task2.start()
        time.sleep(3)
        task3.start()
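
A minimal way to launch the harvester on its own; note that get_proxies refers to the IP_SCORE class from section 2, so that class has to be defined first:

if __name__=='__main__':
    proxies=get_proxies()
    proxies.run()  # harvest kuaidaili pages 1-900 on three threads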

The harvested proxies are written to the text file one ip:port pair per line (screenshot omitted).

2. A scoring system for the proxy pool:

The get_all_ip method first reads every proxy saved in the local text file and gives each an initial score of 20;
the score method then adjusts a proxy's score: +10 each time the proxy fetches a page successfully, -1 each time it fails;
get_ip sorts the proxies by score and randomly returns one of the highest-scoring ones.
The program:

import random

class IP_SCORE(object):
    def __init__(self):
        self.ip_score={}

    def score(self,ip,wl=0):
        '''
        Score a proxy.
        wl==0: new proxy, initial score 20
        wl==1: successful proxy, +10
        anything else: failed proxy, -1
        '''
        if wl==0:
            self.ip_score[ip]=self.ip_score.get(ip,20)
        elif wl==1:
            self.ip_score[ip]=self.ip_score.get(ip,20)+10
        else:
            self.ip_score[ip]=self.ip_score.get(ip,20)-1

    def get_ip(self):
        '''Sort proxies by score and return one of the five best at random.'''
        ip=sorted(self.ip_score.items(), key=lambda e:e[1], reverse=True)
        return random.choice(ip[:5])[0]

    def get_all_ip(self):
        '''Load every proxy saved on disk and give each the initial score.'''
        with open('D:/ip代理.txt','r',encoding='utf-8') as f:
            ips=f.readlines()
        for ip in ips:
            self.score(ip.strip('\n'),0)
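
A short usage sketch (assuming D:/ip代理.txt already holds some harvested proxies; the two addresses below are made up):

ip_score=IP_SCORE()
ip_score.get_all_ip()              # every saved proxy starts at 20 points
ip_score.score('1.2.3.4:8080',1)   # a successful request: +10
ip_score.score('5.6.7.8:8080',-1)  # a failed request: -1
print(ip_score.get_ip())           # one of the highest-scoring proxies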

3. The scraper for the China road logistics data:

The program first reads the saved request URLs from the local file, parses each one with the process_url method, and then issues the request with get_response. While doing so, get_response calls the IP_SCORE class introduced above to pick a high-scoring proxy. The returned JSON is unpacked by parse_response, and finally to_csv appends the record to a CSV file.
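
For instance, process_url only needs to percent-encode the Chinese substrings in each request body read from disk, since the rest is already URL-safe. A quick illustration of the idea (the body string here is shortened and made up):

import re
import urllib.parse

raw='startLine=吉林&endLine=大连'
encoded=re.sub('[\u4e00-\u9fa5]+',lambda m:urllib.parse.quote(m.group(0)),raw)
print(encoded)  # startLine=%E5%90%89%E6%9E%97&endLine=%E5%A4%A7%E8%BF%9E

The complete program: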

import json
import re
import time
import csv
import requests
import random
import urllib.parse
from bs4 import BeautifulSoup
import threading

class get_proxies(object):
    def __init__(self):
        # headers for the kuaidaili listing pages, copied from Chrome DevTools
        self.headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                 'Accept-Encoding': 'gzip, deflate, br',
                 'Accept-Language': 'zh-CN,zh;q=0.9',
                 'Cache-Control': 'max-age=0',
                 'Connection': 'keep-alive',
                 'Cookie': 'channelid=0; sid=1574476990415840; _ga=GA1.2.1841802931.1574478582; _gid=GA1.2.556817404.1574478582; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1573801017,1573805424,1574478582; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1574478732',
                 'Host': 'www.kuaidaili.com',
                 'Referer': 'https://www.kuaidaili.com/free/inha/6/',
                 'Upgrade-Insecure-Requests': '1',  # header values must be strings
                 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
        # headers for the test POST against the logistics-index endpoint;
        # no hard-coded Content-Length: requests computes it from the body
        self.test_headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
                      'Accept-Encoding': 'gzip, deflate',
                      'Accept-Language': 'zh-CN,zh;q=0.9',
                      'Connection': 'keep-alive',
                      'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                      'Cookie': 'clientlanguage=zh_CN; JSESSIONID=8348A6C6CADA87FF9ECF0A87DAB96AC4',
                      'Host': 'index.0256.cn',
                      'Origin': 'http://index.0256.cn',
                      'Referer': 'http://index.0256.cn/expx.htm',
                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
                      'X-Requested-With': 'XMLHttpRequest'}

        self.ip_score=IP_SCORE()  # scoring system, defined below

    def get_page(self,url):
        try:
            response=requests.get(url,headers=self.headers)
            response.raise_for_status()  # raise if the status code is not 2xx
            return response.text
        except requests.RequestException:
            print('Failed to fetch proxy page '+url)
            return ''

    def kuaidaili(self,startpage,endpage):
        '''Walk the kuaidaili free-proxy listing and yield ip:port strings.'''
        start_url='https://www.kuaidaili.com/free/inha/{page}/'
        for page in range(startpage,endpage):
            url=start_url.format(page=page)
            time.sleep(1)
            print('Fetching proxy page '+url)
            response=self.get_page(url)
            if response:
                soup=BeautifulSoup(response,'lxml')
                trs=soup.tbody.find_all(name='tr')
                for tr in trs:
                    ip=tr.find(attrs={'data-title':'IP'}).get_text()
                    port=tr.find(attrs={'data-title':'PORT'}).get_text()
                    yield ip+':'+port

    def to_txt(self,proxy):
        '''Append a working proxy to the text file on the D drive.'''
        with open('D:/ip代理.txt','a',encoding='utf-8') as f:
            f.write(proxy+'\n')

    def test(self,proxy):
        '''POST to the target endpoint through the proxy; keep the proxy if it works.'''
        test_url='http://index.0256.cn/expcenter_trend.action'
        proxy_host='http://'+proxy
        proxies={'http':proxy_host}
        data={'marketId':1,'attribute1':1,'exponentTypeId':2,'cateId':2, 'attribute2':'华北','city':'','startLine':'吉林','endLine':'大连'}
        try:
            response=requests.post(test_url,data=data,headers=self.test_headers,proxies=proxies,timeout=2)
            response.raise_for_status()
            print('Got a working proxy: '+proxy_host)
            self.ip_score.score(proxy,wl=1)
            self.to_txt(proxy)
        except Exception:
            pass

    def main(self,startpage,endpage):
        for proxy in self.kuaidaili(startpage,endpage):
            self.test(proxy)

    def run(self):
        # four threads harvest different page ranges in parallel
        task1=threading.Thread(target=self.main,args=(1,300))
        task2=threading.Thread(target=self.main,args=(300,600))
        task3=threading.Thread(target=self.main,args=(600,900))
        task4=threading.Thread(target=self.main,args=(1000,1300))
        task1.start()
        time.sleep(3)
        task2.start()
        time.sleep(3)
        task3.start()
        time.sleep(3)
        task4.start()


class IP_SCORE(object):
    def __init__(self):
        self.ip_score={}

    def score(self,ip,wl=0):
        '''
        Score a proxy.
        wl==0: new proxy, initial score 20
        wl==1: successful proxy, +10
        anything else: failed proxy, -1
        '''
        if wl==0:
            self.ip_score[ip]=self.ip_score.get(ip,20)
        elif wl==1:
            self.ip_score[ip]=self.ip_score.get(ip,20)+10
        else:
            self.ip_score[ip]=self.ip_score.get(ip,20)-1

    def get_ip(self):
        '''Sort proxies by score and return one of the three best at random.'''
        ip=sorted(self.ip_score.items(), key=lambda e:e[1], reverse=True)
        return random.choice(ip[:3])[0]

    def get_all_ip(self):
        '''Load every proxy saved on disk and give each the initial score.'''
        with open('D:/ip代理.txt','r',encoding='utf-8') as f:
            ips=f.readlines()
        for ip in ips:
            self.score(ip.strip('\n'),0)

    
class heyuling(object):
    def __init__(self,url_filename,data_filename):
        self.url_filename=url_filename
        self.data_filename=data_filename
        self.url='http://index.0256.cn/expcenter_trend.action'
        # headers copied from Chrome DevTools; no hard-coded Content-Length,
        # requests computes it from the body
        self.headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
                      'Accept-Encoding': 'gzip, deflate',
                      'Accept-Language': 'zh-CN,zh;q=0.9',
                      'Connection': 'keep-alive',
                      'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                      'Cookie': 'clientlanguage=zh_CN; JSESSIONID=8348A6C6CADA87FF9ECF0A87DAB96AC4',
                      'Host': 'index.0256.cn',
                      'Origin': 'http://index.0256.cn',
                      'Referer': 'http://index.0256.cn/expx.htm',
                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
                      'X-Requested-With': 'XMLHttpRequest'}
        self.ip_score=IP_SCORE()
        self.get_proxies=get_proxies()

    def get_response(self,url,data,headers,code):
        ip=self.ip_score.get_ip()  # pick one of the highest-scoring proxies
        try:
            proxy={'http':'http://'+ip}
            response=requests.post(url,data=data,headers=headers,timeout=2,proxies=proxy)
            response.raise_for_status()
            d=self.parse_response(response,code)
            self.ip_score.score(ip,1)  # reward the proxy on success
            print('Fetched: {data}'.format(data=data))
            return d
        except Exception:
            # any failure (network error, bad JSON) penalises the proxy,
            # then the request is retried recursively with another proxy
            print('Failed: {data}'.format(data=data)+'... retrying')
            self.ip_score.score(ip,-1)
            return self.get_response(url,data,headers,code)

    def get_proxy(self):
        # start harvesting proxies; runs alongside the spider thread
        self.get_proxies.run()

    def parse_response(self,response,code):
        '''Pull the latest date/value pair out of the returned JSON.'''
        inf_json=json.loads(response.text)
        record={}
        date=inf_json['chart1']['xLebal'][-1]  # 'xLebal'/'yLebal' are the site's own keys
        timearray=time.strptime(date,'%Y-%m-%d')
        record['date']=time.strftime('%Y%m%d',timearray)
        record['value']=inf_json['chart1']['yLebal'][-1]
        record['code']=code
        return record

    def to_csv(self,filename,data):
        with open(filename,'a',encoding='utf-8',newline='') as f:
            writer=csv.writer(f)
            writer.writerow([data['code'],data['date'],data['value']])

    def get_urls(self,filename):
        '''Yield [request_body, code] pairs from the tab-separated URL file.'''
        with open(filename,'r',encoding='gbk') as f:
            urls=f.readlines()
        for u in urls:
            url=u.split('\t')[0]
            code=u.split('\t')[1].strip()
            yield [url,code]

    def process_url(self,url):
        '''Percent-encode the Chinese substrings of a request body read from the file.'''
        repl=lambda matched:urllib.parse.quote(matched.group(0))
        new_url=re.sub('[\u4e00-\u9fa5]+',repl,url)
        return new_url

    def main(self):
        infs=self.get_urls(self.url_filename)
        self.ip_score.get_all_ip()  # seed the scoring table from the file on disk
        num=0
        for inf in infs:
            url=inf[0]
            code=inf[1]
            num+=1
            data=self.process_url(url)
            d=self.get_response(self.url,data,self.headers,code)
            print('Saving record {n}'.format(n=num))
            self.to_csv(self.data_filename,d)

    def run(self):
        # the spider and the proxy harvester run in parallel
        SpiderThread=threading.Thread(target=self.main,args=())
        IPThread=threading.Thread(target=self.get_proxy,args=())
        SpiderThread.start()
        IPThread.start()
        SpiderThread.join()
        IPThread.join()


if __name__=='__main__':
    #url_file=input('Path of the file holding the request bodies: ')
    #data_file=input('Path of the output CSV: ')
    url_file='D:/中国公路物流运价指数/URL月.txt'
    data_file='D:/中国公路物流运价指数/202001011.csv'
    hyl=heyuling(url_file,data_file)
    hyl.run()
