Spider code (spiders/seo2.py):
#coding:utf-8
import scrapy
from seo2.items import Seo2Item
import urllib
import re
query = "手表回收"
def search(req, html):
    """Return the first capture group of pattern *req* found in *html*.

    Falls back to the literal string "no" when there is no match --
    callers rely on that sentinel, so it must be preserved.
    """
    found = re.search(req, html)
    return found.group(1) if found else "no"
class Dmozspider(scrapy.Spider):
    """Scrape the first Baidu result page for *query* and yield one
    Seo2Item per organic result (ranks 1-10).

    NOTE(review): this module is Python 2 (urllib.unquote, xrange);
    kept that way for consistency with the rest of the file.
    """
    name = "seo2"
    start_urls = ['http://www.baidu.com/s?wd=%s' % query]

    def __get_url_query(self, url):
        """Return the raw (still URL-encoded) value of the wd= parameter."""
        # The original chained .group(1) directly and would raise
        # AttributeError on a URL without wd=; guard and return "" instead.
        m = re.search(r"wd=(.*)", url)
        return m.group(1) if m else ""

    def parse(self, response):
        """Parse one SERP and yield query/rank/title/lading items."""
        query = urllib.unquote(self.__get_url_query(response.url))
        # Baidu numbers its organic results with id="1" .. id="10".
        for result_id in xrange(1, 11):  # renamed from `id` (shadowed builtin)
            div = response.xpath("//*[@id='%s']" % result_id)
            if not div:
                # Slot missing (ads / fewer than 10 results) -- the original
                # would have crashed on extract()[0]; skip it instead.
                continue
            rank = div.xpath("@id").extract()[0]
            # Plain-text title: take the <a> element's inner HTML, then strip
            # any remaining tags. The original pattern was garbled in transit
            # ("]*?>([\s\S]*?)"); "<a ...>(...)</a>" is the presumable intent
            # -- TODO confirm against live Baidu markup.
            title = re.sub(r"<[^>]*?>", "",
                           search(r"<a[^>]*?>([\s\S]*?)</a>",
                                  div.xpath("h3/a").extract()[0]))
            # Landing-page URL. The original line was truncated mid-expression
            # ('lading = search(".*?'); the h3/a @href attribute is the
            # obvious source -- TODO confirm this matches the old behavior.
            lading = div.xpath("h3/a/@href").extract()[0]
            item = Seo2Item()
            item['title'] = title
            item['rank'] = rank
            item['lading'] = lading
            item['query'] = query
            yield item
Items code (items.py):
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class Seo2Item(scrapy.Item):
    """Container for one Baidu search result scraped by the seo2 spider.

    Fields:
        title  -- plain-text title of the result
        query  -- the decoded search query that produced the page
        lading -- landing-page URL of the result
        rank   -- position of the result on the SERP ("1".."10")
    """
    title = scrapy.Field()
    query = scrapy.Field()
    lading = scrapy.Field()
    rank = scrapy.Field()
    # (removed a redundant trailing `pass` -- the class body is non-empty)
分享到: