先通过西刺代理制作IP代理池,然后将User-Agent伪造为安卓机或者
苹果机,再通过筛选出的可用IP登录目标网页。本意是通过电脑抓包,
获取投票POST请求的data和参数(params),以构造相应的请求。
但腾讯对PC端做了限制:PC端只能浏览网页,不能投票,因此抓取不到
POST传递的信息。本爬虫只能做到构建代理池,并通过代理池和安卓
或苹果的User-Agent访问网站;若有需要可爬取页面内容,但无法模拟
手机投票或者刷票。
附上Python代码,用到了requests包。
# coding=utf-8
import urllib2
import random
import time
import requests
import re
class JB(object):
    """Scrape a proxy list and probe each proxy with a mobile User-Agent.

    Instantiating the class runs the whole workflow immediately: ``n1``
    scrapes ``ip:port`` strings into ``self.pro_g`` and ``n2`` then tries
    them one by one against httpbin.org and the target article page.
    """

    # Upper bound on how many scraped proxies ``n2`` probes.
    MAX_PROXIES = 80
    # Seconds allowed per HTTP request; without a timeout a dead proxy can
    # block the run indefinitely.
    TIMEOUT = 10
    # Captures the two <td> cells following the <td class="country"> cell of
    # a table row; ``n1`` joins them as "<first>:<second>" (ip:port).
    PROXY_ROW = re.compile(
        '<td class="country">.*? alt="Cn" /></td>'
        '.*?<td>(.*?)</td>.*?<td>(.*?)</td>',
        re.S)

    def __init__(self):
        """Create an empty pool, then scrape and probe proxies right away."""
        self.pro_g = []
        print('赋值完成')
        self.n1()
        self.n2()

    def n1(self):
        """Fetch the proxy listing page and append its entries to the pool.

        Returns:
            ``self.pro_g``, holding one ``'ip:port'`` string per matched row.

        Raises:
            requests.RequestException: if the listing page cannot be fetched
                (including when the request exceeds ``TIMEOUT``).
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        page = requests.get('http://www.xicidaili.com/nt',
                            headers=headers, timeout=self.TIMEOUT)
        for host, port in self.PROXY_ROW.findall(page.text):
            proxy = host + ':' + port
            print(proxy)
            self.pro_g.append(proxy)
        print('爬取IP完成个数:')
        print(len(self.pro_g))
        return self.pro_g

    def n2(self):
        """Probe the scraped proxies and fetch the target page through them.

        At most ``MAX_PROXIES`` entries of ``self.pro_g`` are tried, in
        order; a pool with fewer entries (or none) is handled without error.
        For each proxy the method first requests httpbin.org/ip with a
        desktop User-Agent and, if that succeeds, requests the article page
        with an Android/WeChat User-Agent. Results are only printed.
        """
        print(self.pro_g)
        desktop_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
        mobile_headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; U; Android 2.3.6; zh-cn; GT-S5660 '
                          'Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) '
                          'Version/4.0 Mobile Safari/533.1 MicroMessenger/4.5.255'}
        # Slice instead of indexing a fixed range so a short pool cannot
        # raise IndexError.
        for proxy in self.pro_g[:self.MAX_PROXIES]:
            proxies_l = {'http': proxy}
            print(proxy)
            try:
                probe = requests.get('http://httpbin.org/ip',
                                     headers=desktop_headers,
                                     proxies=proxies_l,
                                     timeout=self.TIMEOUT)
                print('可以用的IP')
                print(probe.text)
                article = requests.get(
                    'http://mp.weixin.qq.com/s/ep30umOyUNS03IvFSj7msg',
                    headers=mobile_headers,
                    proxies=proxies_l,
                    timeout=self.TIMEOUT)
                print('得到PAGE')
                print(article.text)
            except requests.RequestException:
                # Only network/proxy failures mark a proxy as unusable;
                # Ctrl-C and programming errors are no longer swallowed.
                print('no proxies')
            # Random pause between probes.
            sleep_time = random.randint(1, 3)
            time.sleep(sleep_time)
            print('Wait%ds' % sleep_time)
# Instantiating JB runs the full scrape-and-probe workflow from its constructor.
spider=JB()