Tutorial Walkthrough
This tutorial demonstrates crawling the 51job (前程无忧) website. The site has almost no anti-scraping measures, which makes it a friendly target for crawler practice.
Create the Scrapy project
scrapy startproject qianchengwuyou
cd qianchengwuyou
scrapy genspider wuyou jobs.51job.com
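After these commands, the project directory should look roughly like the standard layout generated by scrapy startproject and scrapy genspider:
qianchengwuyou/
    scrapy.cfg
    qianchengwuyou/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            wuyou.py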
Define the fields to scrape (items.py)
import scrapy


class QianchengwuyouItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # job title
    Job_title = scrapy.Field()
    # salary level
    Pay_level = scrapy.Field()
    # recruiting company
    Recruitment_unit = scrapy.Field()
    # work location
    Workplace = scrapy.Field()
    # work experience
    hands_background = scrapy.Field()
    # education requirements
    Education_requirements = scrapy.Field()
    # job description (duties + requirements + experience)
    Career_information = scrapy.Field()
    # keywords
    keyword = scrapy.Field()
    # posting date
    day = scrapy.Field()
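For reference, a scrapy.Item behaves like a dict, which is why the pipeline later can simply call dict(item). A minimal sketch (the value here is just a hypothetical example):
item = QianchengwuyouItem()
item["Job_title"] = "大数据开发工程师"   # hypothetical example value
print(dict(item))                        # {'Job_title': '大数据开发工程师'}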
Write the spider (wuyou.py)
import scrapy
import re
from ..items import QianchengwuyouItem


class WuyouSpider(scrapy.Spider):
    name = 'wuyou'
    allowed_domains = ['search.51job.com', 'jobs.51job.com']
    start_urls = ['https://search.51job.com/list/000000,000000,0000,00,3,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=1&dibiaoid=0&line=&welfare=']
    # Alternative: build the listing URLs for pages 1-800 up front instead of
    # following the "next page" link in parse():
    # for a in range(1, 800):
    #     print('crawling listing page ' + str(a))
    #     url = ('https://search.51job.com/list/000000,000000,0000,00,3,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,' + str(a) + '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=1&dibiaoid=0&line=&welfare=')
    #     start_urls.append(url)
    def parse(self, response):
        # collect the links of all job postings on the current listing page
        all_list = response.xpath('//*[@id="resultList"]//div[@class="el"]')
        for b in all_list:
            all_url = b.xpath('./p/span/a/@href').extract_first()
            yield scrapy.Request(
                all_url,
                callback=self.parse_details
            )
            print(all_url)
        # follow the "next page" (下一页) link until there is none left
        next_url = response.xpath("//*[@id='resultList']//div[@class='p_in']//li/a[text()='下一页']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                next_url,
                callback=self.parse
            )
        else:
            print("crawl finished!")
        # selectors = response.xpath('//div[@class="el"]')
        # for selector in selectors:
        #     # job title and posting date taken from the listing page (unused)
        #     post = selector.xpath('./p/span/a/text()').get(default='000 ').replace('\r\n', '')
        #     day = selector.xpath('./span[4]/text()').get(default='000 ')
        #     print(post, day)
    def parse_details(self, response):
        # parse the job detail page
        print("=" * 100)
        print(response.url)
        print("crawling job details ...")
        item = QianchengwuyouItem()
        # job title
        item["Job_title"] = response.xpath("//div[@class='cn']/h1/text()").extract()
        item["Job_title"] = [i.strip() for i in item["Job_title"]]
        item["Job_title"] = [i for i in item["Job_title"] if len(i) > 0]
        item["Job_title"] = "".join(item["Job_title"]).replace(",", ",")
        # salary level
        item["Pay_level"] = response.xpath("//div[@class='cn']/strong/text()").extract_first()
        # recruiting company
        item["Recruitment_unit"] = response.xpath("//div[@class='cn']//a[1]/text()").extract_first()
        # work location + experience + education all live in //div[@class='cn']/p[2]
        item["Workplace"] = response.xpath("//div[@class='cn']/p[2]/text()[1]").get().replace('\xa0', '')
        # work experience
        item["hands_background"] = response.xpath("//div[@class='cn']/p[2]/text()").extract()
        item["hands_background"] = [i.strip() for i in item["hands_background"]]
        item["hands_background"] = [i for i in item["hands_background"] if "经验" in i]
        item["hands_background"] = " ".join(item["hands_background"]).replace("\xa0", "")
        if len(item["hands_background"]) == 0:
            item["hands_background"] = "无"
        # the second text node is either the experience requirement or the education requirement
        all = response.xpath("//div[@class='cn']/p[2]/text()[2]").get().replace('\xa0', '')
        if len(all) == 2:
            # the node itself is the education requirement (e.g. "本科")
            item["Education_requirements"] = all
        elif len(all) >= 4:
            # the node is the experience requirement, so education is the third text node
            item["Education_requirements"] = response.xpath("//div[@class='cn']/p[2]/text()[3]").get().replace('\xa0', '')
            if len(item["Education_requirements"]) != 2:
                item["Education_requirements"] = "无"
        else:
            item["Education_requirements"] = "无"
        # job description (duties + requirements + experience)
        item["Career_information"] = response.xpath("//div[@class='bmsg job_msg inbox']//text()").extract()
        item["Career_information"] = [i.strip() for i in item["Career_information"]]
        item["Career_information"] = [i for i in item["Career_information"] if len(i) > 0]
        item["Career_information"] = " ".join(item["Career_information"]).replace("\xa0", "").replace(",", ",")
        if item["Pay_level"] is None:
            item["Pay_level"] = "无"
        # keywords
        item["keyword"] = response.xpath("//div[@class='mt10']//p//a/text()").extract()
        # item["keyword"] = [i.strip() for i in item["keyword"]]
        item["keyword"] = [i for i in item["keyword"] if len(i) > 0]
        item["keyword"] = " ".join(item["keyword"]).replace("\xa0", "").replace(",", ",")
        # posting date: prefer the "MM-DD发布" pattern in the page source,
        # fall back to the title attribute of p[2]
        riqi = re.findall(r"(\d+-\d+)发布", response.text)
        if riqi:
            item["day"] = riqi[0]
        else:
            item["day"] = response.xpath("//div[@class='cn']/p[2]/@title").get().replace("\xa0", "")
        yield item
        print("item scraped successfully!")
Write the database pipeline (pipelines.py)
from pymongo import MongoClient


class QianchengwuyouPipeline:
    def open_spider(self, spider):
        self.client = MongoClient("localhost", 27017)
        self.db = self.client.qiancheng      # database "qiancheng"
        self.collection = self.db.liuli05    # collection "liuli05"

    def process_item(self, item, spider):
        # insert one document per scraped item
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
Configure anti-blocking measures (settings.py)
BOT_NAME = 'qianchengwuyou'
SPIDER_MODULES = ['qianchengwuyou.spiders']
NEWSPIDER_MODULE = 'qianchengwuyou.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 ' \
'Safari/537.36 Edg/83.0.478.58 '
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
FEED_EXPORT_ENCODING = 'utf-8'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Enable the item pipeline
ITEM_PIPELINES = {
    'qianchengwuyou.pipelines.QianchengwuyouPipeline': 300,
}
If the spider runs without errors, the MongoDB database should now contain the scraped data.
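A quick way to verify, assuming the database and collection names used in the pipeline above (qiancheng / liuli05):
from pymongo import MongoClient

client = MongoClient("localhost", 27017)
collection = client.qiancheng.liuli05
print(collection.count_documents({}))   # number of scraped postings
print(collection.find_one())            # peek at one document
client.close()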
Installing MongoDB
Software download
MongoDB: link: https://pan.baidu.com/s/1Yf_7g4kSY_wvWde78wlzpQ extraction code: f6se
Installation tutorial
Test data
I have already scraped part of the data and share it here for testing.
Link: https://pan.baidu.com/s/1ymL917_etE-5Fn2HBuWHFQ extraction code: 5x5k
Setting up the Linux environment
All of the resources I used are published here.
Bundle: link: https://pan.baidu.com/s/1KeFcPVKdlWeJHJIT6VemAg extraction code: pk3v
To make testing easier, I also share the fully configured virtual machine.
P.S.: My setup is a fully distributed cluster of three machines. What I share is the master node; for the other two worker nodes, simply clone the master twice and change the hostname and IP address.
Link: https://pan.baidu.com/s/1sIPW_D0JuvxfXLXJP7hNqQ extraction code: 9hdg
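For example, on each clone (a minimal sketch assuming a systemd-based Linux such as CentOS 7; the hostnames follow the cluster below, the IP addresses are hypothetical and must match your own network):
hostnamectl set-hostname hadoop02      # run on the first clone
hostnamectl set-hostname hadoop03      # run on the second clone
# /etc/hosts on all three machines (hypothetical IPs)
192.168.1.101 hadoop01
192.168.1.102 hadoop02
192.168.1.103 hadoop03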
Start the Hadoop cluster
hadoop01: start-dfs.sh
hadoop02: start-yarn.sh
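The running daemons can be checked with jps on each node; roughly the following should appear, though the exact distribution depends on how the cluster is configured:
jps
# on hadoop01 (after start-dfs.sh): NameNode, DataNode, SecondaryNameNode
# on hadoop02 (after start-yarn.sh): ResourceManager, NodeManager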
Data storage
Requirement: store the scraped data on HDFS, using Flume to collect the files.
1. Export the scraped data from the MongoDB database and save it as a CSV or txt file.
For a CSV file, this can be done with Python:
import pymongo
import csv

mongo_url = "localhost:27017"
DATABASE = "xxxxx"  # database name
TABLE = "xxxx"      # collection name (table)

client = pymongo.MongoClient(mongo_url)
db_des = client[DATABASE]
db_des_table = db_des[TABLE]

# newline='' prevents blank lines in the output file (Python 3 only)
with open(f"{DATABASE}_{TABLE}.csv", "w", newline='', encoding="utf-8") as csvfileWriter:
    writer = csv.writer(csvfileWriter)
    fieldList = [
        "Job_title",
        "Pay_level",
        "Recruitment_unit",
        "Workplace",
        "hands_background",
        "Education_requirements",
        "Career_information",
        "keyword",
        "day"
    ]
    writer.writerow(fieldList)
    allRecordRes = db_des_table.find()
    # write one row per document
    for record in allRecordRes:
        # print(f"record = {record}")
        recordValueLst = []
        for field in fieldList:
            if field not in record:
                recordValueLst.append("None")
            else:
                recordValueLst.append(record[field])
        try:
            writer.writerow(recordValueLst)
        except Exception as e:
            print(f"write csv exception. e = {e}")
print("data export finished!")
However, since Flume will be used to collect the data later, a txt file is more convenient.
A txt file can be exported with Navicat Premium 15.
Navicat Premium 15 (cracked version)
Link: https://pan.baidu.com/s/1_SA8xEemn0a5Vvp7CQ870g extraction code: 5fb9
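If you prefer not to use Navicat, a similar pymongo sketch can dump the collection to a tab-separated txt file (same hypothetical database/collection placeholders as the CSV script above):
import pymongo

DATABASE = "xxxxx"  # database name
TABLE = "xxxx"      # collection name
fieldList = ["Job_title", "Pay_level", "Recruitment_unit", "Workplace",
             "hands_background", "Education_requirements",
             "Career_information", "keyword", "day"]

client = pymongo.MongoClient("localhost:27017")
with open(f"{DATABASE}_{TABLE}.txt", "w", encoding="utf-8") as f:
    for record in client[DATABASE][TABLE].find():
        # one line per document, fields separated by tabs
        values = [str(record.get(field, "None")) for field in fieldList]
        f.write("\t".join(values) + "\n")
client.close()
print("txt export finished!")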
Flume configuration file
# The configuration file needs to define the sources,
# the channels and the sinks.
# Sources, channels and sinks are defined per agent,
# in this case called 'a1'

# name the components of the agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# describe/configure the source
a1.sources.r1.type = spooldir
# directory to monitor; ingested files are renamed with a suffix
a1.sources.r1.spoolDir = /usr/qianchengwuyou
# suffix appended to files after they have been uploaded
a1.sources.r1.fileSuffix = .COMPLETED
# always add the absolute file path as a header (default false)
a1.sources.r1.fileHeader = true
# ignore (do not upload) files ending in .tmp
a1.sources.r1.ignorePattern = ([^ ]*\.tmp)
# enable truncation of overly long events (default true)
a1.sources.r1.interceptors.i2.cutFlag = true
# maximum length to keep, in KB; keep it under 2 MB (1 MB = 1024 KB)
a1.sources.r1.interceptors.i2.cutMax = 2048

# describe the sink: write to HDFS
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /Hadoop/flume/%y-%m-%d/%H-%M/
# prefix for the uploaded files
a1.sinks.k1.hdfs.filePrefix = pachong
# roll directories based on time
a1.sinks.k1.hdfs.round = true
# how many time units per new directory
a1.sinks.k1.hdfs.roundValue = 1
# the time unit used for rolling
a1.sinks.k1.hdfs.roundUnit = hour
# use the local timestamp
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# number of events to buffer before flushing to HDFS
a1.sinks.k1.hdfs.batchSize = 10
# file type (compression is supported)
a1.sinks.k1.hdfs.fileType = DataStream
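Assuming the configuration above is saved as, say, qianchengwuyou.conf (a hypothetical file name), the agent can be started and the result checked on HDFS like this:
# start the agent; the --name value must match the prefix used in the config (a1)
flume-ng agent --conf conf --conf-file qianchengwuyou.conf --name a1 -Dflume.root.logger=INFO,console
# after copying the exported txt file into /usr/qianchengwuyou, verify the upload on HDFS
hdfs dfs -ls /Hadoop/flume/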