Analysis:
1. Use Chrome's developer tools (F12) to locate the site's AJAX endpoint.
2. Send a POST request to that endpoint to receive the data as JSON (a minimal sketch of the request follows this list).
3. Build a proxy pool and crawl through it.
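For step 2, here is a minimal sketch of that POST request on its own, without any proxy. The endpoint, form fields, and headers are the ones captured in DevTools and reused in the full program below; the particular field values (华北, 吉林, 大连) are just one example query.

import requests

url = 'http://index.0256.cn/expcenter_trend.action'
# Form fields as captured in Chrome DevTools: this combination asks for
# the freight index of the 吉林 -> 大连 line in the 华北 region.
data = {'marketId': 1, 'attribute1': 1, 'exponentTypeId': 2, 'cateId': 2,
        'attribute2': '华北', 'city': '', 'startLine': '吉林', 'endLine': '大连'}
headers = {'X-Requested-With': 'XMLHttpRequest',
           'Referer': 'http://index.0256.cn/expx.htm'}
response = requests.post(url, data=data, headers=headers, timeout=5)
print(response.json())  # the endpoint answers with JSON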
1. The proxy pool:
The pool builder walks the pages of a free-proxy site and tests every proxy it finds against the target page; each proxy that succeeds is appended to a text file on the D drive.
Concretely, it browses the kuaidaili listing pages and checks each proxy for availability. Once roughly five working proxies have been collected, the crawler can be started; from then on the crawler and the proxy harvester run side by side, so fresh working proxies keep coming in.
Limitation: the proxies in the pool are not guaranteed to stay usable. Some work at test time but die later, and some that fail the initial test come alive afterwards.
import requests
from bs4 import BeautifulSoup
import time
import threading  # needed by run() below
class get_proxies(object):
def __init__(self):
#proxy_list=[]
self.headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'channelid=0; sid=1574476990415840; _ga=GA1.2.1841802931.1574478582; _gid=GA1.2.556817404.1574478582; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1573801017,1573805424,1574478582; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1574478732',
            'Host': 'www.kuaidaili.com',
'Referer': 'https://www.kuaidaili.com/free/inha/6/',
            'Upgrade-Insecure-Requests': '1',  # header values must be strings
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
self.test_headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '105',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'clientlanguage=zh_CN; JSESSIONID=8348A6C6CADA87FF9ECF0A87DAB96AC4',
'Host': 'index.0256.cn',
'Origin': 'http://index.0256.cn',
'Referer': 'http://index.0256.cn/expx.htm',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest'}
        self.ip_score=IP_SCORE()  # IP_SCORE is the scoring class defined in section 2 below
def get_page(self,url):
try:
response=requests.get(url,headers=self.headers)
            response.raise_for_status()  # raise if the response status is not OK
return response.text
        except requests.RequestException:
            print('Failed to fetch proxy page ' + url)
            return ''
def kuaidaili(self,startpage,endpage):
start_url='https://www.kuaidaili.com/free/inha/{page}/'
for page in range(startpage,endpage):
url=start_url.format(page=page)
time.sleep(1)
            print('Fetching proxy page ' + url)
response=self.get_page(url)
if response:
soup=BeautifulSoup(response,'lxml')
trs=soup.tbody.find_all(name='tr')
for tr in trs:
ip=tr.find(attrs={'data-title':'IP'}).get_text()
port=tr.find(attrs={'data-title':'PORT'}).get_text()
proxy=ip+':'+port
yield proxy
    def to_txt(self,proxy):
        # append a working proxy to the pool file on disk
with open('D:/ip代理.txt','a',encoding='utf-8') as f:
f.write(proxy+'\n')
    def test(self,proxy):
        # verify the proxy by sending the real POST request through it
test_url='http://index.0256.cn/expcenter_trend.action'
proxy_host="http://" + proxy
proxy2={'http':proxy_host}
data={'marketId':1,'attribute1':1,'exponentTypeId':2,'cateId':2, 'attribute2':'华北','city':'','startLine':'吉林','endLine':'大连'}
try:
response=requests.post(test_url,data=data,headers=self.test_headers,proxies=proxy2,timeout=2)
response.raise_for_status()
            print('Got a working proxy: ' + proxy_host)
self.ip_score.score(proxy,wl=1)
self.to_txt(proxy)
        except Exception:
            pass  # the proxy failed the check; just drop it
def main(self,startpage,endpage):
for proxy in self.kuaidaili(startpage,endpage):
self.test(proxy)
def run(self):
task1=threading.Thread(target=self.main,args=(1,300,))
task2=threading.Thread(target=self.main,args=(300,600,))
task3=threading.Thread(target=self.main,args=(600,900,))
task1.start()
time.sleep(3)
task2.start()
time.sleep(3)
task3.start()
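The class has no entry point of its own. A minimal way to start the pool builder (assuming the IP_SCORE class from section 2 below is defined in the same module, since __init__ instantiates it):

if __name__ == '__main__':
    pool = get_proxies()
    pool.run()  # three threads walk pages 1-900 and append working proxies to D:/ip代理.txt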
Each working proxy ends up in D:/ip代理.txt as a single ip:port line.
2. A scoring system for the proxy pool:
The get_all_ip method first reads all proxies saved on disk and gives each one an initial score of 20.
The score method adjusts a proxy's score: +10 every time the proxy fetches the page successfully, -1 every time it fails.
The get_ip method sorts the proxies by score and randomly picks one of the top five for the next request.
The program:
import random
class IP_SCORE(object):
def __init__(self):
self.ip_score={}
def score(self,ip,wl=0):
        '''
        Score a proxy:
        wl == 0: new proxy, initial score 20
        wl == 1: successful request, +10
        anything else: failed request, -1
        '''
if wl==0:
self.ip_score[ip]=self.ip_score.get(ip,20)
elif wl==1:
self.ip_score[ip]=self.ip_score.get(ip,20)+10
else:
self.ip_score[ip]=self.ip_score.get(ip,20)-1
def get_ip(self):
        'Sort the proxies by score and randomly pick one of the five highest-scoring ones'
ip=sorted(self.ip_score.items(), key=lambda e:e[1], reverse=True)
return random.choice(ip[:5])[0]
def get_all_ip(self):
with open('D:/ip代理.txt','r',encoding='utf-8') as f:
ips=f.readlines()
for ip in ips:
self.score(ip.strip('\n'),0)
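A quick sketch of how the scores evolve (the proxy address here is made up for illustration):

s = IP_SCORE()
s.score('1.2.3.4:8080', 0)   # new proxy: score starts at 20
s.score('1.2.3.4:8080', 1)   # one success: 20 + 10 = 30
s.score('1.2.3.4:8080', -1)  # one failure: 30 - 1 = 29
print(s.ip_score['1.2.3.4:8080'])  # 29
print(s.get_ip())                  # picks randomly among the highest-scoring proxies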
3. The crawler for the China road logistics freight-rate index:
The program first reads the saved request URLs from a local file. process_url percent-encodes the Chinese characters in each URL, and get_response builds and sends the request, calling the IP_SCORE class described above to pick a high-scoring proxy. parse_response then unpacks the returned JSON, and finally to_csv appends each record to a CSV file.
import json
import re
import time
import csv
import requests
import random
import urllib.parse
from bs4 import BeautifulSoup
import threading
class get_proxies(object):
def __init__(self):
#proxy_list=[]
self.headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'channelid=0; sid=1574476990415840; _ga=GA1.2.1841802931.1574478582; _gid=GA1.2.556817404.1574478582; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1573801017,1573805424,1574478582; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1574478732',
            'Host': 'www.kuaidaili.com',
'Referer': 'https://www.kuaidaili.com/free/inha/6/',
            'Upgrade-Insecure-Requests': '1',  # header values must be strings
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
self.test_headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '105',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'clientlanguage=zh_CN; JSESSIONID=8348A6C6CADA87FF9ECF0A87DAB96AC4',
'Host': 'index.0256.cn',
'Origin': 'http://index.0256.cn',
'Referer': 'http://index.0256.cn/expx.htm',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest'}
self.ip_score=IP_SCORE()
def get_page(self,url):
try:
response=requests.get(url,headers=self.headers)
            response.raise_for_status()  # raise if the response status is not OK
return response.text
        except requests.RequestException:
            print('Failed to fetch proxy page ' + url)
            return ''
def kuaidaili(self,startpage,endpage):
start_url='https://www.kuaidaili.com/free/inha/{page}/'
for page in range(startpage,endpage):
url=start_url.format(page=page)
time.sleep(1)
            print('Fetching proxy page ' + url)
response=self.get_page(url)
if response:
soup=BeautifulSoup(response,'lxml')
trs=soup.tbody.find_all(name='tr')
for tr in trs:
ip=tr.find(attrs={'data-title':'IP'}).get_text()
port=tr.find(attrs={'data-title':'PORT'}).get_text()
proxy=ip+':'+port
yield proxy
def to_txt(self,proxy):
with open('D:/ip代理.txt','a',encoding='utf-8') as f:
f.write(proxy+'\n')
def test(self,proxy):
test_url='http://index.0256.cn/expcenter_trend.action'
proxy_host="http://" + proxy
proxy2={'http':proxy_host}
data={'marketId':1,'attribute1':1,'exponentTypeId':2,'cateId':2, 'attribute2':'华北','city':'','startLine':'吉林','endLine':'大连'}
try:
response=requests.post(test_url,data=data,headers=self.test_headers,proxies=proxy2,timeout=2)
response.raise_for_status()
            print('Got a working proxy: ' + proxy_host)
self.ip_score.score(proxy,wl=1)
self.to_txt(proxy)
        except Exception:
            pass  # the proxy failed the check; just drop it
def main(self,startpage,endpage):
for proxy in self.kuaidaili(startpage,endpage):
self.test(proxy)
def run(self):
task1=threading.Thread(target=self.main,args=(1,300,))
task2=threading.Thread(target=self.main,args=(300,600,))
task3=threading.Thread(target=self.main,args=(600,900,))
task4=threading.Thread(target=self.main,args=(1000,1300,))
        task5=threading.Thread(target=self.main,args=(1600,1900,))  # must be defined, since task5.start() is called below
task1.start()
time.sleep(3)
task2.start()
time.sleep(3)
task3.start()
time.sleep(3)
task4.start()
time.sleep(3)
task5.start()
class IP_SCORE(object):
def __init__(self):
self.ip_score={}
def score(self,ip,wl=0):
        '''
        Score a proxy:
        wl == 0: new proxy, initial score 20
        wl == 1: successful request, +10
        anything else: failed request, -1
        '''
if wl==0:
self.ip_score[ip]=self.ip_score.get(ip,20)
elif wl==1:
self.ip_score[ip]=self.ip_score.get(ip,20)+10
else:
self.ip_score[ip]=self.ip_score.get(ip,20)-1
def get_ip(self):
        ip=sorted(self.ip_score.items(), key=lambda e:e[1], reverse=True)
        return random.choice(ip[:3])[0]  # pick randomly among the three highest-scoring proxies
def get_all_ip(self):
with open('D:/ip代理.txt','r',encoding='utf-8') as f:
ips=f.readlines()
for ip in ips:
self.score(ip.strip('\n'),0)
class heyuling(object):
def __init__(self,url_filename,data_filename):
#self.data=data
self.url_filename=url_filename
self.data_filename=data_filename
self.url='http://index.0256.cn/expcenter_trend.action'
self.headers={'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
            'Content-Length': '105',  # copied from the captured request; requests would normally compute this itself
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'clientlanguage=zh_CN; JSESSIONID=8348A6C6CADA87FF9ECF0A87DAB96AC4',
'Host': 'index.0256.cn',
'Origin': 'http://index.0256.cn',
'Referer': 'http://index.0256.cn/expx.htm',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest'}
self.ip_score=IP_SCORE()
self.get_proxies=get_proxies()
def get_response(self,url,data,headers,code):
#self.ip_score.get_all_ip()
        ip=self.ip_score.get_ip()  # pick a high-scoring proxy
try:
proxy={'http':'http://'+ip}
response=requests.request("POST", url, data=data, headers=headers,timeout=2,proxies=proxy)
response.raise_for_status()
d=self.parse_response(response,code,self.url,data,self.headers)
self.ip_score.score(ip,1)
            print('Fetched successfully: {data}'.format(data=data))
return d
        except Exception:
            print('Request failed: {data}'.format(data=data) + '... retrying')
            self.ip_score.score(ip,-1)
            return self.get_response(url,data,headers,code)  # retry recursively with another proxy
def get_proxy(self):
        # start harvesting proxies in the background
self.get_proxies.run()
def parse_response(self,response,code,url,data,headers):
inf=response.text
inf_json=json.loads(inf)
data={}
        date=inf_json['chart1']['xLebal'][-1]  # latest date; 'xLebal'/'yLebal' are the key names as they appear in the site's JSON
        timearray=time.strptime(date,'%Y-%m-%d')
        data['date']=time.strftime('%Y%m%d',timearray)  # reformat YYYY-MM-DD as YYYYMMDD
        data['value']=inf_json['chart1']['yLebal'][-1]  # latest index value
data['code']=code
return data
def to_csv(self,filename,data):
with open(filename,'a',encoding='utf-8',newline='') as f:
writer=csv.writer(f)
writer.writerow([data['code'],data['date'],data['value']])
    def get_urls(self,filename):
        # each line of the URL file is: <url><TAB><code>
with open(filename,'r',encoding='gbk') as f:
urls=f.readlines()
for u in urls:
url=u.split('\t')[0]
code=u.split('\t')[1].strip()
yield [url,code]
def process_url(self,url):
        '''
        Percent-encode the Chinese characters in a URL read from the text file
        '''
        repl=lambda matched: urllib.parse.quote(matched.group(0))
        new_url=re.sub('[\u4e00-\u9fa5]+',repl,url)  # encode every run of CJK characters
return new_url
def main(self):
infs=self.get_urls(self.url_filename)
self.ip_score.get_all_ip()
num=0
for inf in infs:
url=inf[0]
code=inf[1]
num+=1
data=self.process_url(url)
d=self.get_response(self.url,data,self.headers,code)
            print('Saving record {n}'.format(n=num))
self.to_csv(self.data_filename,d)
def run(self):
SpiderThread=threading.Thread(target=self.main,args=())
IPThread=threading.Thread(target=self.get_proxy,args=())
SpiderThread.start()
IPThread.start()
SpiderThread.join()
IPThread.join()
if __name__=='__main__':
    #urlfile=input('Path of the file that holds the URLs: ')
    #datafile=input('Path for the output file: ')
url_file='D:/中国公路物流运价指数/URL月.txt'
data_file='D:/中国公路物流运价指数/202001011.csv'
hyl=heyuling(url_file,data_file)
hyl.run()
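For reference, a quick check of what process_url does to the Chinese parameters in a saved request body (the string below is a made-up example in the same shape as the lines in URL月.txt, and the file names passed to heyuling are placeholders):

h = heyuling('urls.txt', 'out.csv')
raw = 'marketId=1&attribute2=华北&startLine=吉林&endLine=大连'
print(h.process_url(raw))
# marketId=1&attribute2=%E5%8D%8E%E5%8C%97&startLine=%E5%90%89%E6%9E%97&endLine=%E5%A4%A7%E8%BF%9E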