0 创建项目
scrapy startproject tencent
1 创建一个新的爬虫
在项目的spiders文件夹下,运行cmd并执行:scrapy genspider tengcent "tencent.com"(爬虫名不能与项目名tencent相同,因此这里使用tengcent)
2 确定目标,编写items.py
获取职位名称、详细信息(工作地点、发布时间)
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class TencentItem(scrapy.Item):
    """Scrapy item holding the three fields collected for each job posting."""

    # Name/title of the position.
    positionName = scrapy.Field()

    # Location text attached to the posting.
    workLocation = scrapy.Field()

    # Publication time text attached to the posting.
    publishTime = scrapy.Field()
3 编写tengcent.py(爬虫文件)
import scrapy
from tencent.items import TencentItem
class TengcentSpider(scrapy.Spider):
    """Spider that walks the paginated job list and yields one item per posting.

    Pagination is driven by ``offset``: each parsed page schedules the next
    one (offset + 10) until the offset reaches 50.
    """

    name = 'tengcent'
    allowed_domains = ['tencent.com']
    baseURL = 'http://hr.tencent.com/position.php?&start='
    offset = 0
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        """Extract every posting on the page, then request the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            TencentItem: one item per ``<a class="recruit-list-link">`` node.
            scrapy.Request: a request for the next page while offset < 50.
        """
        for node in response.xpath("//a[@class='recruit-list-link']"):
            item = TencentItem()
            # .get() returns the first match as str; the default keeps a
            # missing node from raising IndexError and aborting the page.
            # Values stay as str (not bytes) so the pipeline can JSON-encode
            # them.
            item['positionName'] = node.xpath("./h4/text()").get(default='')
            item['workLocation'] = node.xpath(
                "./p[@class='recruit-text']/text()"
            ).get(default='')
            item['publishTime'] = node.xpath(
                "./p/span[last()]/text()"
            ).get(default='')
            # Handed to the item pipeline by the engine.
            yield item

        # ---- pagination ----
        if self.offset < 50:
            self.offset += 10
            # offset is an int and must be converted before concatenation.
            url = self.baseURL + str(self.offset)
            # Pass the method itself as the callback (do not call it); the
            # engine forwards the request to the scheduler.
            yield scrapy.Request(url, callback=self.parse)
4 编写pipeline并启用(在settings.py的ITEM_PIPELINES中添加 'tencent.pipelines.TencentPipeline': 300)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json
class TencentPipeline:
    """Item pipeline that writes each item to ``tencent.json``.

    Every item is written as one JSON object followed by ``,`` and a newline.
    """

    def __init__(self):
        # process_item dumps with ensure_ascii=False, so non-ASCII text is
        # written verbatim; an explicit UTF-8 encoding keeps the write from
        # failing (or producing mojibake) on platforms whose default locale
        # encoding is not UTF-8.
        self.f = open('tencent.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* to a JSON line, write it, and pass the item on.

        Args:
            item: the scraped item (anything ``dict()`` can convert).
            spider: the spider that produced the item (unused).

        Returns:
            The same *item*, so later pipelines can keep processing it.
        """
        content = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        self.f.write(content)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.f.close()
5 运行爬虫
scrapy crawl tengcent