Straight to the code:
import requests
from bs4 import BeautifulSoup
import time
import pymysql
def get_info(URL):
    # Fetch one job detail page and pull out every field we store
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'}
    try:
        wb_data = requests.get(URL, headers=headers)
        soup = BeautifulSoup(wb_data.content, 'html.parser')
        name = soup.select('div.pos_base_info > span.pos_title')[0].text.strip()  # job title
        last = soup.select('span.pos_name')[0].text.strip()                       # position name
        hy = soup.select('p.comp_baseInfo_belong > a')[0].text.strip()            # industry the company belongs to
        gs = soup.select('div.baseInfo_link > a')[0].text.strip()                 # company name
        # First six welfare tags, joined with '|'
        bq = '|'.join(s.text.strip() for s in soup.select('div.pos_welfare > span.pos_welfare_item')[:6])
        # First three requirement items, joined with '|'
        yq = '|'.join(s.text.strip() for s in soup.select('div.pos_base_condition > span.item_condition')[:3])
        jg = soup.select('span.pos_salary')[0].text.strip()                       # salary
        # Work address: area and detail, joined with '-'
        address = '-'.join(s.text.strip() for s in soup.select('div.pos-area > span')[:2])
        ms = soup.select('div.des')[0].text                                       # job description
        js = soup.select('div.shiji > p')[0].text                                 # detail text from div.shiji
        job_all = [name, last, hy, gs, bq, yq, jg, address, ms, js]
        print('Data fetched successfully!')
        return job_all
    except Exception:
        # Any missing selector or failed request lands here; the caller filters out the None
        print('Scrape failed for', URL)
        return None
def database_found():
    # Create the database and table if they do not exist yet
    conn = pymysql.connect(host='localhost', user='root', password='root')
    cursor = conn.cursor()
    cursor.execute('CREATE DATABASE IF NOT EXISTS 58_job DEFAULT CHARSET utf8;')
    conn.select_db('58_job')
    sql = """CREATE TABLE IF NOT EXISTS `job` (
        `name` varchar(255),
        `last` varchar(255),
        `hy` varchar(255),
        `gs` varchar(255),
        `bq` varchar(255),
        `yq` varchar(255),
        `jg` varchar(255),
        `address` varchar(255),
        `ms` varchar(1000),
        `js` varchar(1000)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8"""
    cursor.execute(sql)
    cursor.close()
    conn.close()
def save_page_data(data: list):
    conn = pymysql.connect(host='localhost', user='root', password='root')
    conn.select_db('58_job')
    cur = conn.cursor()
    # Parameterized INSERT: the driver escapes each value, which plain string
    # concatenation did not (it broke on values containing quotes and invited SQL injection)
    sql_str = 'INSERT INTO job VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
    if cur.execute(sql_str, data) == 0:
        print('Insert failed!')
    else:
        print('Row inserted successfully!')
    cur.close()
    conn.commit()
    conn.close()
'''
The function below collects every posting link from one list page, then
requests each detail page individually and extracts its data.
'''
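# Note: the selector 'div.job_name > a' returns every anchor in the listing;
# only hrefs containing 'yewu' point at detail pages that get_info() can parse,
# so the loop below skips anything else.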
def get_all_info():
    addr_code = '0d30364d-0015-60bd-6f5c-6b4f6a48df00'
    page_num = '1'
    start_time = time.time()
    res = []
    url = f'https://zz.58.com/yewu/pn{page_num}/?PGTID={addr_code}&ClickID=3'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'}
    try:
        wb_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(wb_data.text, 'html.parser')
        GetLink = soup.select('div.job_name > a')
    except Exception:
        # Without the link list there is nothing to crawl, so bail out here
        # (the original fell through and crashed on an undefined GetLink)
        print('List page could not be parsed; the page layout may have changed')
        return
    for i in GetLink:
        link = i.get('href')
        if link and 'yewu' in link:
            res.append(get_info(link))
    # Drop postings whose detail page failed to parse (get_info returned None)
    new_res = [i for i in res if i is not None]
    database_found()
    all_num = len(new_res)
    while new_res:
        print(len(new_res))
        save_page_data(new_res.pop())
    print(f'Crawled {all_num} records in {time.time() - start_time:.1f}s')

get_all_info()
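The script above only walks the first list page (page_num is hard-coded to '1'). Below is a minimal sketch of crawling several pages with a polite delay between requests; the crawl_pages name, the page range, and the 2-second delay are illustrative assumptions, not values from the original site.

def crawl_pages(first_page=1, last_page=3, delay=2.0):
    # Hypothetical multi-page variant reusing get_info/database_found/save_page_data
    addr_code = '0d30364d-0015-60bd-6f5c-6b4f6a48df00'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'}
    database_found()
    total = 0
    for page_num in range(first_page, last_page + 1):
        url = f'https://zz.58.com/yewu/pn{page_num}/?PGTID={addr_code}&ClickID=3'
        try:
            soup = BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')
        except requests.RequestException:
            print(f'List page {page_num} could not be fetched, skipping')
            continue
        for a in soup.select('div.job_name > a'):
            link = a.get('href')
            if link and 'yewu' in link:
                row = get_info(link)
                if row is not None:
                    save_page_data(row)
                    total += 1
                time.sleep(delay)  # space out detail-page requests
        time.sleep(delay)          # space out list-page requests
    print(f'Crawled {total} records across {last_page - first_page + 1} pages')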