1. Tencent job postings spider
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree

from ..items import TencentTtem


class TxSpider(scrapy.Spider):
    name = 'tx'
    allowed_domains = ['hr.tencent.com']

    # Build the paginated start URLs (10 postings per page, 55 pages)
    start_urls = []
    url = 'https://hr.tencent.com/position.php?lid=&tid=&keywords=python&start=%d'
    for page in range(55):
        start_urls.append(url % (page * 10))

    def parse(self, response):
        content = response.body.decode('utf-8')
        tree = etree.HTML(content)
        tr_list = tree.xpath('//table[@class="tablelist"]/tr')
        # Drop the trailing pagination row and the header row
        tr_list.pop()
        tr_list.pop(0)
        for tr in tr_list:
            item = TencentTtem()
            # Job title
            name = tr.xpath('./td[1]/a/text()')
            item['name'] = name[0]
            # Job category
            job_type = tr.xpath('./td[2]/text()')
            item['type'] = job_type[0]
            # Number of openings
            num = tr.xpath('./td[3]/text()')
            item['num'] = num[0]
            # Location
            address = tr.xpath('./td[4]/text()')
            item['address'] = address[0]
            # Publication date
            time = tr.xpath('./td[5]/text()')
            item['time'] = time[0]
            # Detail-page URL (relative), then join it with the domain
            deile = tr.xpath('./td[1]/a/@href')
            item['deile'] = deile[0]
            detail_url = 'http://hr.tencent.com/' + deile[0]
            yield scrapy.Request(detail_url, callback=self.deile_parse,
                                 meta={'data': item})

    def deile_parse(self, response):
        content = response.body.decode('utf-8')
        tree = etree.HTML(content)
        item = response.meta['data']
        # Job duties: all text in row 3 of the detail table
        duty_list1 = tree.xpath('//table[@class="tablelist textl"]/tr[3]//text()')
        item['duty'] = ''.join(t.strip() for t in duty_list1)
        # Job requirements: all text in row 4 of the detail table
        duty_list2 = tree.xpath('//table[@class="tablelist textl"]/tr[4]//text()')
        item['duty1'] = ''.join(t.strip() for t in duty_list2)
        yield item
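Decoding response.body and re-parsing it with lxml works, but Scrapy responses already expose XPath selectors, which removes the manual decode step. A minimal sketch of the same list-page extraction using the built-in selectors (same table structure assumed, header and pagination rows skipped by slicing):

def parse(self, response):
    # response.xpath queries the downloaded page directly; no manual
    # decoding or lxml re-parsing is needed.
    rows = response.xpath('//table[@class="tablelist"]/tr')[1:-1]
    for row in rows:
        item = TencentTtem()
        item['name'] = row.xpath('./td[1]/a/text()').extract_first()
        item['type'] = row.xpath('./td[2]/text()').extract_first()
        item['num'] = row.xpath('./td[3]/text()').extract_first()
        item['address'] = row.xpath('./td[4]/text()').extract_first()
        item['time'] = row.xpath('./td[5]/text()').extract_first()
        item['deile'] = row.xpath('./td[1]/a/@href').extract_first()
        # urljoin resolves the relative detail link against the page URL
        detail_url = response.urljoin(item['deile'])
        yield scrapy.Request(detail_url, callback=self.deile_parse,
                             meta={'data': item})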
2. pipelines section (MongoDB storage)
import pymongo


class TencentPipeline(object):
    def __init__(self):
        # Connect to the local MongoDB server
        self.client = pymongo.MongoClient('localhost')
        # Select (and implicitly create) the database and collection
        self.db = self.client['Tencent']
        self.table = self.db['Tencentjob']

    def process_item(self, item, spider):
        # insert_one replaces the deprecated Collection.insert
        self.table.insert_one(dict(item))
        return item
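The pipeline above opens the MongoDB connection in __init__ and never closes it. A common refinement is to tie the connection to the spider's lifecycle through the open_spider/close_spider hooks, which Scrapy calls automatically; a minimal sketch, reusing the same database and collection names:

import pymongo


class TencentPipeline(object):
    def open_spider(self, spider):
        # Called once when the spider starts: open the connection here
        self.client = pymongo.MongoClient('localhost')
        self.table = self.client['Tencent']['Tencentjob']

    def close_spider(self, spider):
        # Called once when the spider finishes: release the connection
        self.client.close()

    def process_item(self, item, spider):
        self.table.insert_one(dict(item))
        return item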
3. items section
import scrapy


class TencentTtem(scrapy.Item):
    name = scrapy.Field()     # job title
    type = scrapy.Field()     # job category
    num = scrapy.Field()      # number of openings
    address = scrapy.Field()  # location
    time = scrapy.Field()     # publication date
    deile = scrapy.Field()    # detail-page URL
    duty = scrapy.Field()     # job duties
    duty1 = scrapy.Field()    # job requirements
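For reference, a scrapy.Item behaves like a dict restricted to its declared fields, which is why the pipeline above can pass dict(item) straight to MongoDB:

>>> item = TencentTtem(name='Python developer', num='2')
>>> item['name']
'Python developer'
>>> dict(item)
{'name': 'Python developer', 'num': '2'}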
4. settings section
# -*- coding: utf-8 -*-
# Scrapy settings for tengxu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'tengxu'
SPIDER_MODULES = ['tengxu.spiders']
NEWSPIDER_MODULE = 'tengxu.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'tengxu (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tengxu.middlewares.TengxuSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'tengxu.mymiddleware.TengxuSpiderMiddleware': 543,
# 'tengxu.mymiddleware.ProxyMiddleware': 1,
# }
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'tengxu.pipelines.TencentPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Proxy IP list for the optional ProxyMiddleware above
# PROXIES = [
#
# ]
LOG_FILE = 'tx.log'      # log file
LOG_ENABLED = True       # enable logging
LOG_ENCODING = 'UTF-8'   # log encoding
LOG_LEVEL = 'DEBUG'      # log level
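The commented-out DOWNLOADER_MIDDLEWARES block references a tengxu.mymiddleware.ProxyMiddleware alongside the PROXIES list, but the source does not include that middleware. A minimal sketch of what such a downloader middleware could look like (the module path and the proxy URLs are assumptions; enable it via DOWNLOADER_MIDDLEWARES as shown above):

import random


# Hypothetical middleware matching the 'tengxu.mymiddleware.ProxyMiddleware'
# entry above; the proxy URLs below are placeholders, not real proxies.
class ProxyMiddleware(object):
    PROXIES = [
        'http://127.0.0.1:8888',  # placeholder, replace with working proxies
    ]

    def process_request(self, request, spider):
        # Route each outgoing request through a randomly chosen proxy
        request.meta['proxy'] = random.choice(self.PROXIES)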