Crawling Lagou job data with Scrapy and saving it to a local file and a database

Download link: https://pan.baidu.com/s/1yMM6DE-8RgVZjXEPHGimMQ  Extraction code: yyds

Spider code: lg.py

import scrapy
from sjqx.items import SjqxItem


class SjqxSpider(scrapy.Spider):
    name = 'lg'
    allowed_domains = ['www.lagou.com']

    base_url = 'https://www.lagou.com/beijing-zhaopin/Python/'
    page = 1

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36",
        }

        tmp = "user_trace_token=20211116144103-285b4a8b-cf77-49af-bcc3-a1493b9ac075; _ga=GA1.2.1525634692.1637044883; LGUID=20211116144127-3b1b09bf-d6e5-4da7-b67f-2cdf37a54060; gate_login_token=8c6a5f90a68a54ae79afd8858424331b8c08483ffe6a9a3e0dc1f53e42c6a83b; LG_HAS_LOGIN=1; hasDeliver=0; privacyPolicyPopup=false; RECOMMEND_TIP=true; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; index_location_city=%E5%8C%97%E4%BA%AC; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1637044880,1637111963,1637635397; _gat=1; LGSID=20211123104319-59137906-a5b8-46b6-89bd-6c190379ad6d; PRE_UTM=m_cf_cpt_baidu_pcbt; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fother.php%3Fsc.K60000j28Sa2RWF0rhnSz-DlN8uZ4uFXSBtmAQWdjQRmJG%5FJVHlG3d7YYF4cPiMw0GNdeCsAuu3woK3mcji1sCIt8SpysS7U7E9zRlSaIlsODxV5yTUZJ%5FpnbFF-qEEuUdcIwUKcAvICIm0fBtvWtTsgXLSuscbiub3pkFmRuCAtAh2uxn%5FE4wdfNGcIDBr7hYWeNwW-2TzjHJuMHO06-IGdrshj.7Y%5FNR2Ar5Od663rj6tJQrGvKD77h24SU5WudF6ksswGuh9J4qt7jHzk8sHfGmYt%5FrE-9kYryqM764TTPqKi%5FnYQZHuukL0.TLFWgv-b5HDkrfK1ThPGujYknHb0THY0IAYqs2v4VnL30ZN1ugFxIZ-suHYs0A7bgLw4TARqnsKLULFb5TaV8UHPS0KzmLmqnfKdThkxpyfqnHR1nHD3n1fvn0KVINqGujYkPjRsPHbzr0KVgv-b5HDknH6vP1Td0AdYTAkxpyfqnHczP1n0TZuxpyfqn0KGuAnqiDFK0ZKGujYzPfKWpyfqnHbv0APzm1Y3Pjnz%26ck%3D3145.1.83.248.192.244.184.381%26dt%3D1637635396%26wd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26tpl%3Dtpl%5F12273%5F25897%5F22126%26l%3D1531183460%26us%3DlinkName%253D%2525E6%2525A0%252587%2525E9%2525A2%252598-%2525E4%2525B8%2525BB%2525E6%2525A0%252587%2525E9%2525A2%252598%2526linkText%253D%2525E3%252580%252590%2525E6%25258B%252589%2525E5%25258B%2525BE%2525E6%25258B%25259B%2525E8%252581%252598%2525E3%252580%252591%2525E5%2525AE%252598%2525E6%252596%2525B9%2525E7%2525BD%252591%2525E7%2525AB%252599%252520-%252520%2525E4%2525BA%252592%2525E8%252581%252594%2525E7%2525BD%252591%2525E9%2525AB%252598%2525E8%252596%2525AA%2525E5%2525A5%2525BD%2525E5%2525B7%2525A5%2525E4%2525BD%25259C%2525EF%2525BC%25258C%2525E4%2525B8%25258A%2525E6%25258B%252589%2525E5%25258B%2525BE%21%2526linkType%253D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flanding-page%2Fpc%2Fsearch.html%3Futm%5Fsource%3Dm%5Fcf%5Fcpt%5Fbaidu%5Fpcbt; _putrc=29A9DFB8DFD1D0E6123F89F2B170EADC; JSESSIONID=ABAAABAABEIABCICF0A40B0EA71CD9CF3968BDA37BC6736; login=true; unick=%E7%94%A8%E6%88%B78914; WEBTJ-ID=20211123104328-17d4aad40998ca-0db7a8ed41b3ec-978183a-1327104-17d4aad409a44c; X_HTTP_TOKEN=f3a43640e6ad551b01453673610993f4cb3ba8435e; __SAFETY_CLOSE_TIME__23081889=1; _gid=GA1.2.1853327180.1637635408; sensorsdata2015session=%7B%7D; __lg_stoken__=1ccb553288981424baacaec2ab8e15417b46e5e0f9083c83d084ff93c43c14b1a6e72cae97b09bf5b6f4a122759e16b7415db744e058d09cc3210322104c46c4a074729c5f8c; SEARCH_ID=95c5b5e00e2a4bea983d5072fe38603c; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1637635442; LGRID=20211123104404-f48c9938-3fa6-4b3b-92e1-f85f21347fd9; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2223081889%22%2C%22first_id%22%3A%2217d2785705314e-0338c5cf8d6c37-57b1a33-1327104-17d27857054377%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2296.0.4664.45%22%2C%22lagou_company_id%22%3A%22%22%7D%2C%22%24device_id%22%3A%2217d2785705314e-0338c5cf8d6c37-57b1a33-1327104-17d27857054377%22%7D"
        cookie_list = tmp.split("; ")
        self.cookies = {cookie.split("=")[0]: cookie.split("=")[-1] for cookie in cookie_list}
        self.url = 'https://www.lagou.com/beijing-zhaopin/Python/?filterOption=3&sid=2b73cbfd62d04652945f522f73392f3b'

    def parse(self, response):
        li_list = response.xpath('//ul[@class ="item_con_list"]/li')
        for li in li_list:
            position = li.xpath('.//div[@class="p_top"]//h3/text()').extract_first().replace(" ","")
            place = li.xpath('.//div[@class="p_top"]//a//span//em/text()').extract_first().replace(" ","")
            salary = li.xpath('.//div[@class="p_bot"]//span/text()').extract_first().replace(" ","")
            requirement = li.xpath('.//div[@class="p_bot"]//text()').extract()[4].replace(" ","").replace("\n","").replace("/",",")
            company = li.xpath('.//div[@class="company"]//div[@class="company_name"]/a/text()').extract_first().replace(" ","")
            sjqx = SjqxItem(position=position, place=place, salary=salary, requirement=requirement, company=company)
            yield sjqx
        if self.page < 10:
            self.page = self.page + 1
            url = self.base_url + str(self.page) + '/?filterOption=3&sid=2b73cbfd62d04652945f522f73392f3b'
            # issue a Scrapy GET request for the next results page
            yield scrapy.Request(url=url, callback=self.parse)

    def start_requests(self):
        return [scrapy.Request(url=self.url, headers=self.headers, cookies=self.cookies, callback=self.parse)]
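
The spider imports SjqxItem from sjqx.items, which is not shown in the post. Based on the fields populated in parse() and used in the pipelines below, a minimal items.py would look roughly like this (a sketch inferred from those fields, not the author's original file):

# sjqx/items.py -- minimal sketch inferred from the fields the spider populates
import scrapy


class SjqxItem(scrapy.Item):
    position = scrapy.Field()     # job title
    place = scrapy.Field()        # district / business area
    salary = scrapy.Field()       # salary range, e.g. "15k-25k"
    requirement = scrapy.Field()  # experience and education requirements
    company = scrapy.Field()      # company name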

Code for writing the items to a local file and to the database: pipelines.py. The pipelines also have to be enabled in the project's settings (see the settings sketch after the pipeline code below).

import json


class SjqxPipeline:
    # Called once when the spider opens: open the local output file
    def open_spider(self, spider):
        self.f = open('sjqx.json', 'w', encoding='utf-8')

    # Called once when the spider closes: close the file
    def close_spider(self, spider):
        self.f.close()

    # item is the SjqxItem yielded by the spider
    def process_item(self, item, spider):
        # write() needs a string, so serialize the item as one JSON object per line
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')

        return item

import pymysql
class SjqxMysqlPipeline:

    # Called once when the spider opens: connect to MySQL
    def open_spider(self, spider):
        self.db = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456', db='python', charset='utf8')
        self.cursor = self.db.cursor()

    # Called once when the spider closes: release the cursor and the connection
    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()

    # item is the SjqxItem yielded by the spider
    def process_item(self, item, spider):
        # use a parameterized query so quotes in the scraped text cannot break the SQL
        sql = "INSERT INTO `sjqx`(`place`,`company`,`position`,`salary`,`requirement`) VALUES (%s, %s, %s, %s, %s)"
        self.cursor.execute(sql, (item['place'], item['company'], item['position'], item['salary'], item['requirement']))
        self.db.commit()
        return item
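
Before either pipeline runs, it has to be registered in the project's settings.py. A sketch of the relevant settings, assuming the project module is named sjqx (the priority numbers and the ROBOTSTXT_OBEY value are suggested defaults, not taken from the original post):

# settings.py -- only the parts relevant to this post
ITEM_PIPELINES = {
    'sjqx.pipelines.SjqxPipeline': 300,       # write items to sjqx.json
    'sjqx.pipelines.SjqxMysqlPipeline': 400,  # insert items into MySQL
}
ROBOTSTXT_OBEY = False  # suggested: disable robots.txt checking if it blocks the spider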

To run the spider, change into the Scrapy project directory and execute:

scrapy crawl lg

Note: copy your own cookie string (the one above will have expired), adjust the database connection settings, and create the table before running (a sketch of the table creation follows below).
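
The MySQL pipeline expects a table named sjqx with the five columns used in its INSERT statement. A one-off helper such as the following could create it; the column types and sizes are assumptions, and the connection parameters are simply copied from the pipeline:

# create_table.py -- hypothetical helper, not part of the original post
import pymysql

db = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456', db='python', charset='utf8')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS `sjqx` (
        `id` INT AUTO_INCREMENT PRIMARY KEY,
        `place` VARCHAR(100),
        `company` VARCHAR(255),
        `position` VARCHAR(255),
        `salary` VARCHAR(50),
        `requirement` VARCHAR(255)
    ) DEFAULT CHARSET=utf8
""")
cursor.close()
db.close()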
