Of these, commands 1, 2, and 4 are the ones used most often:
scrapy startproject beike
scrapy genspider ershou https://wz.ke.com/ershoufang/
scrapy crawl ershou
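After the first two commands, Scrapy generates the standard project skeleton (sketched here from the usual template, not copied from the original notes); genspider adds ershou.py under spiders/:

beike/
    scrapy.cfg            # project/deploy configuration
    beike/
        __init__.py
        items.py          # Item definitions
        middlewares.py
        pipelines.py
        settings.py       # project settings (headers, delays, pipelines, ...)
        spiders/
            __init__.py
            ershou.py     # spider created by scrapy genspider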
Using the yield keyword: generators (functions that keep producing values); a minimal sketch follows below.
Advantages:
Of the last two parameters, only one can take effect at a time.
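A minimal sketch (not from the original notes) of how yield turns an ordinary function into a generator that produces values one at a time:

def count_up(limit):
    """A generator: produces values lazily instead of building a whole list first."""
    n = 0
    while n < limit:
        yield n   # execution pauses here and resumes when the next value is requested
        n += 1

for value in count_up(3):
    print(value)  # prints 0, 1, 2

Scrapy relies on the same mechanism: parse methods yield Request objects and items one at a time, and the engine consumes them as they are produced.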
Hands-on example
Spider code:
# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy import Request

from ..items import HangyexinxiItem


class WenzhouSpider(scrapy.Spider):
    name = 'wenzhou'
    start_urls = ['http://www.zhaoshang100.com/']

    def start_requests(self):
        # url = 'http://wenzhou.cnlinfo.net/gongsi/'
        url = 'http://www.zhaoshang100.com/minglu162/'
        yield Request(url, callback=self.parse_index)

    def parse_index(self, response):
        # Collect every company name and its detail-page URL from the listing page.
        gongsi = response.xpath('//ul[@class="proList02"]/li/a[@class="proName02"]/text()').extract()
        urls = response.xpath('//ul[@class="proList02"]/li/a[@class="proName02"]/@href').extract()
        for gongsi1, url in zip(gongsi, urls):
            # Create a fresh item per company; sharing one instance across the loop
            # would let later iterations overwrite the item carried by earlier requests.
            item = HangyexinxiItem()
            item['company'] = gongsi1
            item['url'] = 'http://www.zhaoshang100.com%s' % url
            # Hand the partially filled item to the detail-page callback via meta.
            yield Request(item['url'], meta={'item': item}, callback=self.parse_index2)
            yield item  # also emit the name/url-only record right away

        # Pagination: follow the "下页" (next page) link back into this same callback.
        fanye = response.xpath('//div[@id="pager"]/a').extract()
        for f in fanye:
            if f.find('下页') >= 0:
                fenyeurl = re.findall(r'<a href="(.*?)">', f, re.S)[0]
                yield Request('http://www.zhaoshang100.com%s' % fenyeurl, callback=self.parse_index)

    def parse_index2(self, response):
        # Finish filling in the item passed along via meta with the contact details.
        item = response.meta['item']
        res1 = response.xpath('//div[@class="aiMain"]/ul').extract()
        item['people'] = re.findall(r'<i>联系人</i>(.*?)</li>', res1[0], re.S)[0]
        item['tel'] = re.findall(r'<i>联系电话</i>(.*?)</li>', res1[0], re.S)[0]
        item['phone'] = re.findall(r'<i>联系手机</i>(.*?)</li>', res1[0], re.S)[0]
        yewus = re.findall(r'<i>主营业务</i><a.*?>(.*?)</a> - <a.*?>(.*?)</a> - <.*?>(.*?)</a> - <a.*?>(.*?)</a> - <a.*?>(.*?)</a></li>', res1[0], re.S)[0]
        item['business'] = '{0}、{1}、{2}、{3}、{4}'.format(yewus[0], yewus[1], yewus[2], yewus[3], yewus[4])
        return item
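Two points worth noting about the spider above (my reading, not spelled out in the original): the partially filled item is carried to the detail-page callback through Request's meta dict and completed in parse_index2, and pagination works by yielding a new Request for the "下页" (next page) link back into parse_index. Once everything is in place, the crawl can be started from the project root with, for example:

scrapy crawl wenzhou -o companies.csv

where -o uses Scrapy's built-in feed export to write the yielded items to a file (companies.csv is just an example name).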
Item code:
import scrapy


class HangyexinxiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    company = scrapy.Field()   # company name
    url = scrapy.Field()       # detail-page URL
    people = scrapy.Field()    # contact person
    tel = scrapy.Field()       # landline number
    phone = scrapy.Field()     # mobile number
    business = scrapy.Field()  # main lines of business
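A short usage note (a sketch, not from the original): an Item behaves much like a dict, except that only fields declared with scrapy.Field() may be assigned, which catches typos in field names early:

item = HangyexinxiItem()
item['company'] = 'Example Co.'   # OK: 'company' is a declared field
# item['compny'] = '...'          # would raise KeyError: field not declared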
Modifying the settings.py file
For example, to add request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
}
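Two other settings that are often adjusted in the same file for tutorial crawls like this one (my suggestion, not part of the original notes):

ROBOTSTXT_OBEY = False    # Scrapy obeys robots.txt by default; tutorials often turn this off
DOWNLOAD_DELAY = 1        # wait between requests to avoid hammering the target site

Both are standard Scrapy settings; whether to change them depends on the target site's policies.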