打开文件:
(1)首先第一个蜘蛛:zhilian
打开网站搜“开发”,我们要抓取的是职位名称,公司名称,职位月薪,工作地点
先简单的试验下看看抓出来是什么样子的
import scrapy
class ZhilianSpider(scrapy.Spider):
    """Minimal trial spider: print the job-title link text from one
    Zhilian (zhaopin.com) search-result page for the keyword "开发"."""
    name = 'zhilian'
    allowed_domains = ['zhaopin.com']
    start_urls = ['http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=%E5%BC%80%E5%8F%91&sm=0&p=1']

    def parse(self, response):
        # Each <td class="zwmc"><div><a> cell holds one job-title link.
        links = response.css('td.zwmc>div>a')
        for link in links:
            # BUG fix: the original line was missing the closing
            # parenthesis, which made this snippet a SyntaxError.
            print(link.css('::text').extract())
运行start.py文件
from scrapy.cmdline import execute

if __name__ == '__main__':
    # Ask which spider to run, then hand the full command line
    # to Scrapy's CLI entry point.
    spider_name = input("爬虫名")
    command = "scrapy crawl {name}".format(name=spider_name)
    execute(command.split())
接下来我们再改进,用join:
打开第一个链接,在招聘职位信息中,所有的职位描述都要抓取:
接下来到蜘蛛文件中的代码,zhilian.py中爬下来数据并处理
# -*- coding: utf-8 -*-
import scrapy
#re是导入的正则模块
import re
#导入的item模块
from zhaopin_demo.items import *
class ZhilianSpider(scrapy.Spider):
    """Crawl Zhilian search results for "开发" in Beijing.

    parse() walks the result list, follows every job-detail link, and
    paginates through up to 5 extra pages.  detail() extracts one job
    posting into a ZhaopinDemoItem with fields:
      zwmc (job title), gsmc (company name), zwyx (monthly salary),
      gzdd (work location), gwxq (job description, HTML stripped).
    """
    name = 'zhilian'
    allowed_domains = ['zhaopin.com']
    start_urls = ['http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=%E5%BC%80%E5%8F%91&sm=0&p=1']
    # Counts how many additional result pages have been followed.
    count = 0

    def parse(self, response):
        """Parse one search-result page: follow each job link, then the
        next-page link (at most 5 follow-ups)."""
        cells = response.css('td.zwmc>div')
        for cell in cells:
            # Only the href of the first link in the cell is needed.
            href = cell.xpath('./a[1]/@href').extract_first()
            # Follow the detail page with self.detail as the callback.
            yield response.follow(href, self.detail)
        next_page = response.css('a.next-page::attr("href")').extract_first()
        # BUG fix: pass the callback itself (self.parse), NOT self.parse()
        # — calling it creates a generator instead of registering a
        # callback, so pagination never worked.
        # Also guard against next_page being None on the last page,
        # since response.follow(None, ...) raises.
        if next_page and self.count < 5:
            yield response.follow(next_page, self.parse)
            self.count += 1

    def detail(self, response):
        """Extract one job-detail page into a ZhaopinDemoItem."""
        zwmc = response.css('div.fl>h1::text').extract_first()  # job title
        gsmc = response.css('div.fl>h2>a::text').extract_first()  # company name
        basic = response.css('ul.terminal-ul.clearfix')
        zwyx = basic.xpath('./li[1]/strong/text()').extract_first()  # monthly salary
        gzdd = basic.xpath('./li[2]/strong/a/text()').extract_first()  # work location
        gwxq = response.css('div.tab-inner-cont').extract_first()
        # Strip HTML tags and newlines from the description.
        # Raw string so the regex escapes are explicit.
        pattern = r'<[a-z][^/]*>|</[a-z\s\d]*>|\n'
        # BUG fix: the original used re.subn, which returns a
        # (string, count) tuple — item['gwxq'] ended up a tuple.
        # re.sub returns just the cleaned string.
        gwxq = re.sub(pattern, "", gwxq)
        item = ZhaopinDemoItem()
        item['zwmc'] = zwmc
        item['gsmc'] = gsmc
        item['zwyx'] = zwyx
        item['gzdd'] = gzdd
        item['gwxq'] = gwxq
        # Yield the item so the configured pipeline can process it.
        yield item
接下来在pipelines.py中添加:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#管道中创建kafk的生产者,传到kafka中broker接收
from kafka.producer.kafka import KafkaProducer
import json
class ZhaopinDemoPipeline(object):
    """Pipeline that forwards every scraped item to a Kafka topic."""

    def open_spider(self, spider):
        # Connect to the Kafka broker once, when the spider starts.
        self.producer = KafkaProducer(bootstrap_servers="python2:9092")

    def close_spider(self, spider):
        # Release the Kafka connection when the spider finishes.
        self.producer.close()

    def process_item(self, item, spider):
        # Stringify the incoming item and publish it to topic "cctv1".
        # (A json.dumps(dict(item)) variant was considered upstream;
        # plain str() is the form in use.)
        payload = str(item)
        self.producer.send("cctv1", payload.encode())
        return item
未完待续