Python 3 爬取 51job 并可视化
效果如下
使用多进程爬取 51job 职位数据并插入到 MySQL 中
#____author:"xie"
#date:2020-10-19
# -*- coding: utf-8 -*-
import requests
import time,re
import pymysql
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from multiprocessing import Pool
from urllib import parse
# Today's date (YYYY-MM-DD), available for tagging scraped records.
time_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
# Shared HTTP session with up to 3 automatic retries per scheme.
req = requests.Session()
req.mount('http://', HTTPAdapter(max_retries=3))
req.mount('https://', HTTPAdapter(max_retries=3))
# Browser-like request headers; the Cookie below pins the 51job search state
# (region 040000, keyword "python") captured from a real browser session.
# NOTE(review): the hard-coded Cookie will expire — refresh it if requests
# start returning empty result pages.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
'Host': 'search.51job.com',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ru;q=0.7',
'Upgrade-Insecure-Requests': '1',
'Cookie':'partner=www_baidu_com; guid=dee465ee11cc6ecc88e03ab6df40d83f; 51job=cenglish%3D0%26%7C%26; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60040000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21'
}
def file(url):
    """Scrape one 51job search-result page and store every posting via file_sql.

    Args:
        url: A search-result URL of the form
            https://search.51job.com/list/040000,...,99,<keyword>,2,<page>.html
            where <keyword> is double-URL-encoded.

    Side effects: issues HTTP requests and inserts rows into MySQL.
    Per-posting failures are printed and skipped (best effort).
    """
    # Extract and decode the double-encoded search keyword from the URL.
    job_name = re.findall(
        r'https://search.51job.com/list/040000,000000,0000,00,9,99,(.*?),',
        str(url), re.I)[0]
    job_name = parse.unquote(parse.unquote(job_name))
    # `total` is never updated, so each call processes exactly one page;
    # pagination is driven by the caller handing us one URL per page.
    current, total = 1, 1
    while current <= total:
        response = req.get(url=url, headers=headers, timeout=20)
        print('当前第{}页,url是:{}'.format(current, url))
        soup = BeautifulSoup(response.text, 'html.parser')
        # Job links live inside an inline <script type="text/javascript">
        # JSON blob, so pull them out with a regex over the script text.
        script_blob = soup.find_all(type="text/javascript")
        job_links = re.findall(r'"job_href":"(.*?)"', str(script_blob), re.I)
        for link in job_links:
            # Un-escape the JSON-escaped slashes; bind to a NEW name so the
            # `url` parameter is not clobbered (the original overwrote it,
            # which also made the page log above report a stale URL).
            job_url = str(link).replace(r'\/', '/')
            try:
                detail = req.get(url=job_url, headers=headers, timeout=20)
                time.sleep(0.5)  # throttle: be polite to the server
                page = detail.content.decode("gbk")  # detail pages are GBK-encoded
                detail_soup = BeautifulSoup(page, 'html.parser')
                position = re.findall(r'<h1 title="(.*?)">', page, re.I)[0]
                salary = re.findall(r'</h1><strong>(.*?)</strong>', page, re.I)[0]
                company = detail_soup.find_all('p', class_='cname')
                company_name = re.findall(r'title="(.*?)"', str(company), re.I)[0]
                # Location / experience / education line; strip the "|" separators.
                msg_type = detail_soup.find_all('p', class_='msg ltype')[0]
                msg_type = re.findall(r'title="(.*?)</p>', str(msg_type), re.I)[0]
                msg_type = (str(msg_type).replace('<span>|</span>', '')
                            .replace('|', '').split('">')[-1].split())
                # Benefit tags and the free-text job description.
                jtags = [tag.string
                         for tag in detail_soup.find_all('span', class_='sp4')]
                job_msgs = [node.string
                            for node in detail_soup.find_all(
                                'div', class_='bmsg job_msg inbox')[0]
                            if node.string is not None]
                file_sql(company_name, position, salary, msg_type, jtags,
                         job_msgs, job_url, job_name)
            except Exception as e:
                # Best effort: postings whose markup doesn't match the regexes
                # (IndexError) or that fail to download are skipped, not fatal.
                print(e)
        current += 1
def file_sql(company_name, position, salary, msg_type, jtags, job_msgs, url, job_name):
    """Insert one scraped job posting into the MySQL table `51_data`.

    List-like fields (msg_type / jtags / job_msgs) are flattened to plain
    strings first. A fresh connection is opened per call: each Pool worker
    is a separate process, so connections cannot be shared.
    On failure the transaction is rolled back and the error printed.
    """
    # Keyword arguments: positional connect() args were removed in pymysql 1.0.
    db = pymysql.connect(host="10.147.17.10", user="root",
                         password="6340563", database="51job")
    # Flatten the list reprs into plain strings (same transforms as before).
    job_msgs = str(job_msgs).replace('[', '').replace(']', '').replace(r"', '", '')
    msg_type = str(msg_type).replace(r"'", '').replace('[', '').replace(']', '')
    jtags = str(jtags).replace(r"'", '').replace('[', '').replace(']', '')
    print('正在插入:{} 信息'.format(company_name))
    # Parameterized query. The original interpolated values with str.format,
    # which is SQL-injectable, broke on any value containing a quote, and
    # left the job_msgs placeholder unquoted entirely. Identifiers starting
    # with a digit are backtick-quoted for safety.
    sql = ("INSERT INTO `51_data`(company_name, position, salary, msg_type, "
           "jtags, job_msgs, `51_url`, job_name) "
           "VALUES(%s, %s, %s, %s, %s, %s, %s, %s);")
    try:
        with db.cursor() as cursor:  # cursor closed automatically
            cursor.execute(sql, (company_name, position, salary, msg_type,
                                 jtags, job_msgs, url, job_name))
        db.commit()
    except Exception as e:
        print(e)  # surface the failure instead of silently swallowing it
        db.rollback()
    finally:
        db.close()
if __name__ == '__main__':
    HomePage = 'https://search.51job.com/'
    # The keyword must be double-URL-encoded, mirroring 51job's own links.
    search = parse.quote(parse.quote(input('请输入搜索词:')))
    # Fetch page 1 once just to read the total page count from the inline JSON.
    url = HomePage + 'list/040000,000000,0000,00,9,99,{},2,1.html'.format(search)
    response = req.get(url=url, headers=headers, timeout=20)
    total = int(re.findall(r'"total_page":"(\d+)"', response.text, re.I)[0])
    # Build one URL per result page. The original used range(1, total), which
    # skipped the final page — the upper bound must be total + 1 (inclusive).
    tot_page = [
        HomePage + 'list/040000,000000,0000,00,9,99,{},2,{}.html'.format(search, page)
        for page in range(1, total + 1)
    ]
    pool = Pool(processes=10)
    pool.map(file, tot_page)  # crawl pages in parallel worker processes
    pool.close()
    pool.join()
    print('爬取完成!')
通过flask控制前端展示数据
效果
项目下载链接:
https://download.csdn.net/download/ranranran52/13104414
flask运行成功后地址是 http://127.0.0.1:5000(Flask 开发服务器默认使用 HTTP,而非 HTTPS)