First, install scrapy and pymongo.
Installing and creating the crawler project is straightforward, so we will just run through it quickly:
pip install scrapy
pip install pymongo
scrapy startproject sellsystem
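The spider below imports SellItem from the project's items.py. Here is a minimal sketch of that class, assuming only the three fields populated in this section (the spelling privince is kept to match the spider code; the real project may define more fields):

# sellsystem/items.py -- minimal sketch, only the fields used below are assumed
import scrapy

class SellItem(scrapy.Item):
    privince = scrapy.Field()  # province (spelling kept to match the spider code)
    city = scrapy.Field()      # city
    district = scrapy.Field()  # district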
Then create our spider file under the spiders directory:
import copy
import scrapy
from ..items import SellItem
# next page first
class indexSpider(scrapy.Spider):
    name = 'sell_finally'
    all_province = []
    start_urls = [
        'http://b2b.huangye88.com/region/'
    ]
    page = 1

    def parse(self, response):  # entry point: the region index page
        urls = response.xpath('//dl[@id="clist"]/dd/a/@href').extract()
        for itm in urls:
            print(itm)
            print('111111111111')
            yield scrapy.Request(itm, callback=self.parse_qu)  # follow each city url

    def parse_qu(self, response):  # city page, e.g. http://b2b.huangye88.com/anyang/
        uurls = response.xpath('//*[@id="subarealist"]/div[2]/a/@href').extract()
        for url in uurls:
            print(url)
            print('22222222222222')
            yield scrapy.Request(url, callback=self.parse_instury_list)  # follow each district url

    def parse_instury_list(self, response):  # district page listing the different industries
        item = SellItem()
        urls = response.xpath('//div[@class="tag_tx"]/ul/li/a/@href').extract()
        privince = response.xpath('//div[@class="subNav"]/a[2]/text()').extract()[0][:-4]  # province
        city = response.xpath('//div[@class="subNav"]/a[3]/text()').extract()[0][:-4]  # city
        district = response.xpath('/html/body/div[3]/div[1]/text()').extract()[2]  # district
        item['privince'] = privince  # province
        item['city'] = city  # city
        item['district'] = district[district.find('市') + 1:-6]  # district
        for itm in urls:
            print('33333333333333')
            print(item)
            yield scrapy.Request(itm, callback=self.parse_instury,
                                 meta={'item': copy.deepcopy(item)}, dont_filter=True)
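Note that the item is passed through meta with copy.deepcopy: Scrapy schedules these requests asynchronously, so without a copy every industry request would share, and later overwrite, the same item instance. dont_filter=True keeps Scrapy's duplicate filter from dropping requests to URLs it has already seen. Once the spider is complete, it can be run from the project root by its name:

scrapy crawl sell_finally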