1.加工和处理数据:处理成能够用于统计和分析的格式
# Derive the city from the address, e.g. "苏州-工业园区" -> "苏州".
# str.split returns the whole string when there is no '-', so no branch is
# needed; `or ""` keeps a missing (None) address from raising TypeError.
jobAddress = item['jobAddress']
jobCity = (jobAddress or "").split('-')[0]

# Parse the salary text and normalise it to yuan per month.
# The three values stay 0 when the text has no recognised unit or cannot be
# parsed as a number.
jobSalary = item['jobSalary']
minSalary = 0
maxSalary = 0
meanSalary = 0
# (unit suffix, divisor, multiplier): monthly yuan = number / divisor * multiplier
salaryUnits = (
    ('万/月', 1, 10000),   # e.g. "1-2万/月"
    ('千/月', 1, 1000),    # e.g. "1-2千/月"
    ('万/年', 12, 10000),  # yearly figure spread over 12 months
    ('元/天', 1, 22),      # daily rate x 22 (presumably working days per month)
)
salaryText = jobSalary or ""  # guard against a missing (None) salary
for unit, divisor, multiplier in salaryUnits:
    if unit not in salaryText:
        continue
    salaryRange = salaryText.split(unit)[0]  # "1-2万/月" -> "1-2"
    bounds = salaryRange.split('-')
    lowText = bounds[0]
    # Some postings give a single figure ("2万/月"): use it for both bounds.
    highText = bounds[1] if len(bounds) > 1 else bounds[0]
    try:
        low = float(lowText) / divisor * multiplier
        high = float(highText) / divisor * multiplier
    except ValueError:
        # Not a plain number: keep whatever was parsed so far (0 by default)
        # instead of aborting the whole item.
        continue
    minSalary = low
    maxSalary = high
    meanSalary = (minSalary + maxSalary) / 2
2.将采集到的数据写入MySQL数据库,需先安装pymysql:pip install pymysql
# Write the processed record into the t_job_data table (requires PyMySQL).
# NOTE(review): host/user/password are hard-coded here; consider moving them
# to configuration.
# 1. Open the database connection.
connection = pymysql.connect(host='localhost', user='root', password="123456",
                             database='db_2020_jobinfo', port=3306, charset='utf8')
try:
    # 2. Obtain a cursor; the with-block closes it when the insert is done.
    with connection.cursor() as cursor:
        # 3. Run the parameterised INSERT (values are passed separately from
        #    the SQL text, so the driver handles quoting).
        result = cursor.execute(
            "insert into t_job_data (jobName, jobCompany, jobAddress, jobSalary,jobDate"
            ",jobCity,minSalary,maxSalary ,meanSalary, jobType)"
            "values (%s, %s, %s, %s, %s,%s, %s, %s, %s, %s)",
            [item['jobName'], item['jobCompany'], item['jobAddress'],
             item['jobSalary'], item['jobDate'],
             jobCity, minSalary, maxSalary, meanSalary, item['jobType']])
    # 4. Commit the transaction.
    connection.commit()
    # execute() returns the number of affected rows.
    if result > 0:
        print("写入成功")
finally:
    # Always release the connection, even if the insert or commit failed;
    # previously it was never closed.
    connection.close()
3.修改item
import scrapy
class JobspidersssItem(scrapy.Item):
    """Container for one scraped job posting.

    Every attribute is declared as a ``scrapy.Field``; the attribute name is
    the key used to read or write that value on an item instance
    (e.g. ``item['jobName']``).
    """

    jobName = scrapy.Field()
    jobCompany = scrapy.Field()
    jobAddress = scrapy.Field()
    jobSalary = scrapy.Field()
    jobDate = scrapy.Field()
    jobType = scrapy.Field()
4.编写三个不同参数脚本
from scrapy.cmdline import execute
# Spider launcher script: runs the "jobspiders" spider and passes it the job
# type and the search URL as spider arguments (-a name=value).
if __name__ == "__main__":
    # The original URL had "&degreefrom" mangled into "°reefrom" (the "&deg"
    # prefix was decoded as an HTML entity); the parameter is restored here.
    search_url = (
        "https://search.51job.com/list/000000,000000,0000,00,9,99,Python,2,1.html"
        "?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
        "&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
    )
    execute(['scrapy', 'crawl', 'jobspiders',
             '-a', 'jobType=Python',
             '-a', 'url=' + search_url])
from scrapy.cmdline import execute
# Spider launcher script: runs the "jobspiders" spider for the "人工智能"
# (artificial intelligence) keyword, passing job type and search URL as
# spider arguments (-a name=value).
if __name__ == "__main__":
    # The original URL had "&degreefrom" mangled into "°reefrom" (the "&deg"
    # prefix was decoded as an HTML entity); the parameter is restored here.
    search_url = (
        "https://search.51job.com/list/000000,000000,0000,00,9,99,"
        "%25E4%25BA%25BA%25E5%25B7%25A5%25E6%2599%25BA%25E8%2583%25BD,2,1.html"
        "?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
        "&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1"
        "&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line="
        "&specialarea=00&from=&welfare="
    )
    execute(['scrapy', 'crawl', 'jobspiders',
             '-a', 'jobType=人工智能',
             '-a', 'url=' + search_url])
from scrapy.cmdline import execute
# Spider launcher script: runs the "jobspiders" spider for the Java keyword,
# passing job type and search URL as spider arguments (-a name=value).
if __name__ == "__main__":
    # The original URL had "&degreefrom" mangled into "°reefrom" (the "&deg"
    # prefix was decoded as an HTML entity); the parameter is restored here.
    search_url = (
        "https://search.51job.com/list/000000,000000,0000,00,9,99,Java,2,1.html"
        "?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
        "&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1"
        "&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line="
        "&specialarea=00&from=&welfare="
    )
    execute(['scrapy', 'crawl', 'jobspiders',
             '-a', 'jobType=Java',
             '-a', 'url=' + search_url])
5.运行成功(我只运行了两个脚本)。