爬取https://www.autoblog.com/used-list
这个页面的车辆信息。这是第一页,后面各页的 URL 按如下方式生成:
# Page 1 has no suffix; pages 2-109 append "/page-<n>" to the base URL.
urls = ['https://www.autoblog.com/used-list']
urls += ['https://www.autoblog.com/used-list/page-%d' % page for page in range(2, 110)]
我测试了一下,差不多有 100 多页,所以这里取到第 109 页。
我的想法
将每个红框中的内容作为一段新的 HTML 加载到 pyquery 中,再单独进行解析:
def index_page(self, response):
    # Each 'article.l-article' node is one vehicle card on the listing page;
    # its inner HTML is re-loaded into a standalone PyQuery object so the
    # card can be parsed in isolation.
    for each in response.doc('article.l-article').items():
        q=pq(each.html())
通过分析(CSS 选择器)确定这些信息所在元素的位置,于是:
def index_page(self, response):
    # Parse every vehicle card on a listing page and emit its fields as a message.
    for each in response.doc('article.l-article').items():
        q=pq(each.html())
        # '.val' cells are addressed by fixed position (eq); positions were
        # found by inspecting the page's CSS structure. NOTE(review): a
        # missing cell would shift later fields -- cleaned up afterwards.
        # The synthetic 'url' argument (title+mileage+price+color) is used
        # only for pyspider's URL-based de-duplication.
        self.send_message(self.project_name,{"title":q('.title').find('a').text(), 'mileage':q('.val').eq(1).text(), 'color':q('.val').eq(3).text(), 'engine':q('.val').eq(4).text(), 'mpg':q('.val').eq(5).text(), 'location':q('.val').eq(6).text(),
            "price":q('.r-title-price').text() },url=q('.title').find('a').text()+q('.val').eq(1).text()+q('.r-title-price').text()+q('.val').eq(3).text())  # URL de-dup

@config(priority=2)
def on_message(self, project,msg):
    # Messages sent by index_page arrive here and are returned as result records.
    return msg
整体的代码就是这样
from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq
# Listing pages: the bare URL is page 1; pages 2..109 carry a /page-<n> suffix.
urls = ['https://www.autoblog.com/used-list'] + [
    'https://www.autoblog.com/used-list/page-%d' % n for n in range(2, 110)
]
class Handler(BaseHandler):
    """pyspider handler that scrapes used-vehicle cards from autoblog.com.

    on_start queues every listing page in ``urls``; index_page extracts the
    fields of each vehicle card and forwards them via send_message; on_message
    returns the forwarded dict so pyspider stores it as a result.
    """

    i = 0  # NOTE(review): appears unused; kept for backward compatibility

    crawl_config = {
        "headers": {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299",
            'Connection': "keep-alive",
        },
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # Queue page 1 plus /page-2 .. /page-109 once per day.
        for url in urls:
            self.crawl(url, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # One 'article.l-article' element per vehicle card.
        for each in response.doc('article.l-article').items():
            # 'each' is already a PyQuery object scoped to the card, so it
            # can be queried directly -- the original pq(each.html()) added a
            # needless serialize/re-parse round trip for every card.
            vals = each('.val')
            title = each('.title').find('a').text()
            price = each('.r-title-price').text()
            mileage = vals.eq(1).text()
            color = vals.eq(3).text()
            # '.val' cells are addressed by fixed position, found by
            # inspecting the page's CSS structure; a missing cell would shift
            # later fields (cleaned up downstream).
            self.send_message(
                self.project_name,
                {
                    "title": title,
                    'mileage': mileage,
                    'color': color,
                    'engine': vals.eq(4).text(),
                    'mpg': vals.eq(5).text(),
                    'location': vals.eq(6).text(),
                    "price": price,
                },
                # No stable detail URL is scraped, so a synthetic key built
                # from the fields serves as pyspider's de-duplication 'url'.
                url=title + mileage + price + color,
            )

    @config(priority=2)
    def on_message(self, project, msg):
        # Messages sent by index_page arrive here; returning the dict makes
        # pyspider persist it as a result record.
        return msg
最后有些数据不齐全,通过正则表达式和 csv 操作就可以清洗干净。
想入门看文档是最快的方法。