继续老套路,这两天我爬取了猪八戒网上的一些数据,网址是:http://task.zbj.com/t-ppsj/p1s5.html 。
可能是由于爬取的数据量有点多,结果我的IP被封了,需要自己手动验证才能解封IP,这显然阻止了我爬取更多的数据。
下面是我写的、导致IP被封的猪八戒爬虫代码:
# coding=utf-8
import requests
from lxml import etree
def getUrl(pageCount=33):
    """Build the URL of every list page and hand each one to ``spiderPage``.

    Pages are numbered from 1, so the URLs run from ``.../p1s5.html`` up to
    ``.../p<pageCount>s5.html``.

    Args:
        pageCount: Number of list pages to visit. Defaults to 33, the value
            that used to be hard-coded, so ``getUrl()`` behaves as before.
    """
    for page in range(1, pageCount + 1):
        url = 'http://task.zbj.com/t-ppsj/p{}s5.html'.format(page)
        spiderPage(url)
def _firstOrEmpty(values):
    """Return the first item of an XPath result list, or '' when it is empty."""
    # Conditional expression: <result if true> if <condition> else <result if false>.
    return values[0] if values else ''


def spiderPage(url):
    """Download one list page and print a summary line for each table row.

    Every ``<tr>`` below the element whose class attribute is exactly
    ``tab-switch tab-progress`` is scanned for price, title, link, subtitle
    and deadline; a field that is missing becomes ``''``. The five fields are
    printed on one line, followed by a separator line, and rows that carry a
    link are passed on to ``spiderDetail``.

    Errors are handled per row: a failing row prints ``出错`` plus the
    exception and the remaining rows are still processed. Only ``Exception``
    subclasses are caught, so Ctrl-C still stops the crawl.

    Args:
        url: Address of the list page. ``None`` makes the call a no-op.

    Returns:
        None.

    Raises:
        requests.RequestException: If the page cannot be downloaded,
            including when the request times out.
    """
    if url is None:
        return None

    # Without a timeout a stalled connection would block the crawl forever.
    htmlText = requests.get(url, timeout=10).text
    selector = etree.HTML(htmlText)
    rows = selector.xpath('//*[@class="tab-switch tab-progress"]/table/tr')

    for row in rows:
        try:
            price = _firstOrEmpty(row.xpath('./td/p/em/text()'))
            href = _firstOrEmpty(row.xpath('./td/p/a/@href'))
            title = _firstOrEmpty(row.xpath('./td/p/a/text()'))
            subTitle = _firstOrEmpty(row.xpath('./td/p/text()'))
            deadline = _firstOrEmpty(row.xpath('./td/span/text()'))

            # Single-argument print(...) is valid on both Python 2 and 3.
            print(' '.join([price, title, href, subTitle, deadline]))
            print('---------------------------------------------------------------------------------------')

            # Rows without a link (href == '') have no detail page to visit;
            # spiderDetail itself only guards against None.
            if href:
                spiderDetail(href)
        except Exception as exc:
            print('出错')
            # Show the cause instead of hiding it behind the generic message.
            print(repr(exc))
def spiderDetail(url):
if url is None:
return None
try: