代码如下:
import urllib.request ##请求
import ssl
import re
import xlwt
import pymysql
# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, so urllib HTTPS requests made by this process presumably
# skip certificate verification -- confirm this is intended before reuse.
ssl._create_default_https_context = ssl._create_unverified_context
def getContent(name,j):
    """Fetch one page of 51job search results and return the page text.

    Args:
        name: Search keyword. It is percent-encoded before being placed in
            the URL, so non-ASCII keywords can be sent; an already
            percent-encoded keyword is passed through unchanged.
        j: Zero-based page index; the URL is built with page number j + 1.

    Returns:
        The response body decoded as GBK.

    Raises:
        urllib.error.URLError: if the request cannot be completed.
        UnicodeDecodeError: if the body is not valid GBK.
    """
    # Imported here so the block does not depend on urllib.request exposing
    # urllib.parse as a side effect.
    from urllib.parse import quote

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4503.5 Safari/537.36",
        'Connection': 'keep-alive'
    }
    # "%" stays unescaped so a keyword that is already percent-encoded is
    # not encoded a second time.
    keyword = quote(name, safe="%")
    page_number = j + 1
    # The query string uses "&degreefrom"; the original literal had the
    # "&deg" prefix mangled into a degree sign, which corrupted the URL.
    url = (
        "https://search.51job.com/list/000000,000000,0000,00,9,99,%s,2,%d.html"
        "?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
        "&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
    ) % (keyword, page_number)
    # Request object = URL + request headers.
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(req) as response:
        raw = response.read()
    # Decode explicitly so Chinese text is not garbled.
    return raw.decode("GBK")
def getItem(content):
    """Extract job records from a search-result page.

    Args:
        content: Page text as returned by the fetch step.

    Returns:
        A list with one 12-string tuple per match, in capture order:
        job_href, job_name, company_href, company_name, providesalary_text,
        workarea_text, updatedate, companytype_text, degreefrom,
        attribute_text, companysize_text, companyind_text.
    """
    # Adjacent raw literals are concatenated into a single pattern; they are
    # split only to keep each line readable.
    job_regex = re.compile(
        r'"job_href":"(.+?)","job_name":"(.+?)".+?'
        r'"company_href":"(.+?)","company_name":"(.+?)",'
        r'"providesalary_text":"(.*?)".+?'
        r'"workarea_text":"(.*?)","updatedate":"(.*?)".*?'
        r'"companytype_text":"(.*?)","degreefrom":"(.*?)".*?'
        r'"attribute_text":(.*?),"companysize_text":"(.*?)",.*?,'
        r'"companyind_text":"(.*?)".*?'
    )
    return job_regex.findall(content)
def saveExcel(list):
    """Write the scraped job records to the Excel workbook ``51job2.xls``.

    Args:
        list: Sequence of 12-field records laid out as: 0 job detail link,
            1 job title, 2 company link, 3 company name, 4 salary,
            5 work location, 6 publish date, 7 company type, 8 education,
            9 requirements, 10 company size, 11 industry.
    """
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("数据分析50")

    # Row 0 carries the column titles.
    titles = ["公司的名字", "公司的网址","公司类型","公司规模","行业","工作地点","岗位名字", "待遇","岗位详情", "发布时间","学历","招聘要求"]
    for col, title in enumerate(titles):
        sheet.write(0, col, title)

    # Record field that feeds each spreadsheet column, left to right.
    field_for_column = (3, 2, 7, 10, 11, 5, 1, 4, 0, 6, 8, 9)
    # Data rows start at row 1, directly below the titles.
    for row, record in enumerate(list, start=1):
        for col, field in enumerate(field_for_column):
            sheet.write(row, col, record[field])

    workbook.save("51job2.xls")
# Accumulates every record scraped by the loop below.
# NOTE(review): this name shadows the builtin ``list``; it is kept because the
# commented-out save calls at the end of the file refer to it.
list = []
name = input("请输入您想要搜索的行业")
# j is the zero-based page index; the progress message shows it as j + 1.
for j in range(201):
    print("正在为您查询第%s页数据,请不要进行任何操作或退出程序。"%(j+1))
    # Fetch the page, extract its records, and append them to the results.
    list.extend(getItem(getContent(name, j)))
def saveMysql(list):
    """Insert scraped job records into the ``sjfx`` table of the local MySQL database.

    Args:
        list: Iterable of 12-field records in the order used by the rest of
            this file: 0 job detail link, 1 job title, 2 company link,
            3 company name, 4 salary, 5 work location, 6 publish date,
            7 company type, 8 education, 9 requirements, 10 company size,
            11 industry.

    Returns:
        None. An empty input is a no-op and opens no connection.

    Raises:
        IndexError: if a record has fewer than 12 fields; this is raised
            before any database work, so nothing is inserted.
        pymysql.MySQLError: if connecting or inserting fails; the
            connection is closed without committing.
    """
    # Record field that feeds each target column, in the column order
    # (name, wz, leix, gm, hy, gzdd, gwmz, dy, gwxq, fbsj, xl, zpyq).
    # This is the same mapping saveExcel uses for its header; the original
    # code inserted fields 0..11 positionally, which put every value in the
    # wrong column.
    field_for_column = (3, 2, 7, 10, 11, 5, 1, 4, 0, 6, 8, 9)
    rows = [tuple(record[k] for k in field_for_column) for record in list]
    if not rows:
        return

    # Placeholders are filled in by the driver, so quotes or SQL fragments
    # in scraped text can neither break nor alter the statement.
    sql = (
        "insert into sjfx(name,wz,leix,gm,hy,gzdd,gwmz,dy,gwxq,fbsj,xl,zpyq) "
        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    # NOTE(review): connection settings and credentials are hard-coded.
    conn = pymysql.connect(host="localhost",
                           user="root",
                           password="123",
                           database="xmmysql",
                           charset="utf8")
    try:
        # The cursor is the query session used to run the statement.
        with conn.cursor() as cursor:
            cursor.executemany(sql, rows)
        # Single commit: either every row is stored or none is.
        conn.commit()
    finally:
        conn.close()
# saveExcel(list)
# saveMysql(list)