This article scrapes Python web-crawler job postings (position details and requirements) from Lagou (lagou.com), saves the data to a MySQL database, and finally visualizes it with the ECharts module, giving an intuitive view of the salary, education, work experience, and per-city demand for this role.
author = 小朋友
WeChat = qq735833020
Follow the author's WeChat official account for more details or to get the code.
import requests
from lxml import etree
import time
from pymysql import connect

# Request headers: Lagou's Ajax endpoint typically rejects requests that lack a
# valid Cookie and Referer, so both are included here. The Cookie is session-bound
# and expires quickly, so refresh it from the browser before each run.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    "Cookie": "TG-TRACK-CODE=search_code; JSESSIONID=ABAAABAABEEAAJA288CDC5B3C39F6B68B738BE0F3680D5B; SEARCH_ID=bec263de83e24d58938d14a4e56c7012",
    "Referer": "https://www.lagou.com/jobs/list_Python%E7%88%AC%E8%99%AB?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=",
    "Connection": "keep-alive"
}

# MySQL connection and cursor, shared by all functions below.
conn = connect(host="127.0.0.1", port=3306, user="root", password="123456", database="lagou", charset="utf8")
cur = conn.cursor()
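
# The save_mysql() function below assumes a `job` table already exists in the
# `lagou` database. The original article does not show its schema; the DDL below
# is only a sketch that matches the columns of the INSERT statement, with guessed
# column types. Running it here keeps the script self-contained.
cur.execute("""
    CREATE TABLE IF NOT EXISTS job (
        id INT AUTO_INCREMENT PRIMARY KEY,
        positionName VARCHAR(100),
        companyFullName VARCHAR(200),
        city VARCHAR(50),
        salary VARCHAR(50),
        workYear VARCHAR(50),
        education VARCHAR(50),
        positionLables VARCHAR(255),
        industryField VARCHAR(255),
        detail_info TEXT,
        detail_url VARCHAR(255)
    ) DEFAULT CHARSET = utf8
""")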
def get_index_info(pn):
    """Fetch page `pn` of the search results from Lagou's Ajax endpoint and
    return a list of tuples with the fields we care about."""
    index_url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
    data = {
        "first": "true",
        "pn": pn,           # page number
        "kd": "Python爬虫"  # search keyword
    }
    info_list = list()
    print("Crawling page {}...".format(pn))
    response = requests.post(index_url, headers=headers, data=data).json()
    for item in response["content"]["positionResult"]["result"]:
        positionId = item["positionId"]
        positionName = item["positionName"]
        companyFullName = item["companyFullName"]
        city = item["city"]
        salary = item["salary"]
        workYear = item["workYear"]
        education = item["education"]
        # "positionLables" is the field name (spelled this way) in Lagou's JSON.
        positionLables = ",".join(item["positionLables"]) if len(item["positionLables"]) > 0 else "None"
        industryField = item["industryField"]
        info_list.append((positionId, positionName, companyFullName, city, salary, workYear, education,
                          positionLables, industryField))
    # print(info_list)
    return info_list
def get_detail_info(index_info):
    """Visit each position's detail page and extract the full job description."""
    for item in index_info:
        detail_url = "https://www.lagou.com/jobs/{}.html".format(item[0])  # item[0] is positionId
        print(detail_url)
        response = requests.get(detail_url, headers=headers).text
        html = etree.HTML(response)
        # Join all text nodes inside the job-detail div into a single string.
        job_desc = "".join([node.strip() for node in html.xpath('//div[@class="job-detail"]//text()')])
        # print(job_desc)
        save_mysql(item, job_desc, detail_url)
def save_mysql(item, job_desc, detail_url):
    """Insert one position into the `job` table; roll back on failure."""
    try:
        sql = ("insert into job (id, positionName, companyFullName, city, salary, workYear, education, "
               "positionLables, industryField, detail_info, detail_url) "
               "values (default, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        cur.execute(sql, (item[1], item[2], item[3], item[4], item[5], item[6], item[7], item[8], job_desc, detail_url))
        conn.commit()
    except Exception as e:
        print(e)
        conn.rollback()
def main():
    # Crawl pages 1-23 of the search results.
    for pn in range(1, 24):
        index_info = get_index_info(pn)
        get_detail_info(index_info)
        time.sleep(60)  # pause between pages to avoid triggering Lagou's anti-crawling limits
    cur.close()
    conn.close()
if __name__ == '__main__':
main()
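
The intro mentions visualizing the results with ECharts, but that step is not included in the script above. Below is a minimal sketch of what it could look like, assuming the pyecharts package (the Python wrapper for ECharts) is installed and the `job` table has been populated by the scraper; it groups postings by city and renders a bar chart to a standalone HTML file.

from pymysql import connect
from pyecharts import options as opts
from pyecharts.charts import Bar

# Query the number of postings per city from the table populated by the scraper.
conn = connect(host="127.0.0.1", port=3306, user="root", password="123456", database="lagou", charset="utf8")
cur = conn.cursor()
cur.execute("select city, count(*) from job group by city order by count(*) desc limit 10")
rows = cur.fetchall()
cur.close()
conn.close()

cities = [row[0] for row in rows]
counts = [row[1] for row in rows]

# Render a simple bar chart of demand by city to an HTML file.
bar = (
    Bar()
    .add_xaxis(cities)
    .add_yaxis("Number of postings", counts)
    .set_global_opts(title_opts=opts.TitleOpts(title="Python crawler jobs by city"))
)
bar.render("city_demand.html")

Similar GROUP BY queries on salary, workYear, and education would feed the other charts mentioned in the intro.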