打开文件:
(1)首先第一个蜘蛛:zhilian
打开网站搜“开发”,我们要抓取的是职位名称,公司名称,职位月薪,工作地点
先简单的试验下看看抓出来是什么样子的
import scrapy
class ZhilianSpider(scrapy.Spider):
    """Minimal trial spider: print the job-title link text from one
    Zhilian (zhaopin.com) search-result page for the keyword "开发"."""
    name = 'zhilian'
    allowed_domains = ['zhaopin.com']
    start_urls = ['http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=%E5%BC%80%E5%8F%91&sm=0&p=1']

    def parse(self, response):
        # Each <td class="zwmc"><div><a> cell holds one job-title link.
        links = response.css('td.zwmc>div>a')
        for link in links:
            # BUG fix: the original line was missing the closing
            # parenthesis, which made this snippet a SyntaxError.
            print(link.css('::text').extract())
运行start.py文件
from scrapy.cmdline import execute

if __name__ == '__main__':
    # Ask which spider to run, then hand the full command line
    # to Scrapy's CLI entry point.
    spider_name = input("爬虫名")
    command = "scrapy crawl {name}".format(name=spider_name)
    execute(command.split())
接下来我们再改进,用join:
打开第一个链接,在招聘职位信息中,所有的职位描述都要抓取:
接下来到蜘蛛文件中的代码,zhilian.py中爬下来数据并处理
# -*- coding: utf-8 -*-
import scrapy
#re是导入的正则模块
import re
#导入的item模块
from zhaopin_demo.items import *
class ZhilianSpider(scrapy.Spider):
    """Crawl Zhilian search results for "开发" in Beijing.

    parse() walks the result list, follows every job-detail link, and
    paginates through up to 5 extra pages.  detail() extracts one job
    posting into a ZhaopinDemoItem with fields:
      zwmc (job title), gsmc (company name), zwyx (monthly salary),
      gzdd (work location), gwxq (job description, HTML stripped).
    """
    name = 'zhilian'
    allowed_domains = ['zhaopin.com']
    start_urls = ['http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=%E5%BC%80%E5%8F%91&sm=0&p=1']
    # Counts how many additional result pages have been followed.
    count = 0

    def parse(self, response):
        """Parse one search-result page: follow each job link, then the
        next-page link (at most 5 follow-ups)."""
        cells = response.css('td.zwmc>div')
        for cell in cells:
            # Only the href of the first link in the cell is needed.
            href = cell.xpath('./a[1]/@href').extract_first()
            # Follow the detail page with self.detail as the callback.
            yield response.follow(href, self.detail)
        next_page = response.css('a.next-page::attr("href")').extract_first()
        # BUG fix: pass the callback itself (self.parse), NOT self.parse()
        # — calling it creates a generator instead of registering a
        # callback, so pagination never worked.
        # Also guard against next_page being None on the last page,
        # since response.follow(None, ...) raises.
        if next_page and self.count < 5:
            yield response.follow(next_page, self.parse)
            self.count += 1

    def detail(self, response):
        """Extract one job-detail page into a ZhaopinDemoItem."""
        zwmc = response.css('div.fl>h1::text').extract_first()  # job title
        gsmc = response.css('div.fl>h2>a::text').extract_first()  # company name
        basic = response.css('ul.terminal-ul.clearfix')
        zwyx = basic.xpath('./li[1]/strong/text()').extract_first()  # monthly salary
        gzdd = basic.xpath('./li[2]/strong/a/text()').extract_first()  # work location
        gwxq = response.css('div.tab-inner-cont').extract_first()
        # Strip HTML tags and newlines from the description.
        # Raw string so the regex escapes are explicit.
        pattern = r'<[a-z][^/]*>|</[a-z\s\d]*>|\n'
        # BUG fix: the original used re.subn, which returns a
        # (string, count) tuple — item['gwxq'] ended up a tuple.
        # re.sub returns just the cleaned string.
        gwxq = re.sub(pattern, "", gwxq)
        item = ZhaopinDemoItem()
        item['zwmc'] = zwmc
        item['gsmc'] = gsmc
        item['zwyx'] = zwyx
        item['gzdd'] = gzdd
        item['gwxq'] = gwxq
        # Yield the item so the configured pipeline can process it.
        yield item
接下来在pipelines.py中添加:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#管道中创建kafk的生产者,传到kafka中broker接收
from kafka.producer.kafka import KafkaProducer
import json
class ZhaopinDemoPipeline(object):
    """Pipeline that forwards every scraped item to a Kafka topic."""

    def open_spider(self, spider):
        # Connect to the Kafka broker once, when the spider starts.
        self.producer = KafkaProducer(bootstrap_servers="python2:9092")

    def close_spider(self, spider):
        # Release the Kafka connection when the spider finishes.
        self.producer.close()

    def process_item(self, item, spider):
        # Stringify the incoming item and publish it to topic "cctv1".
        # (A json.dumps(dict(item)) variant was considered upstream;
        # plain str() is the form in use.)
        payload = str(item)
        self.producer.send("cctv1", payload.encode())
        return item
未完待续