# (forum page residue, not part of the script) 该楼层疑似违规已被系统折叠 隐藏此楼查看此楼
#爬取前程无忧的数据
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Presumably meant to accumulate the scraped rows -- TODO confirm intended use.
data_list = []

# Number of search-result pages to request (passed to main() at startup).
num = 10

# HTTP headers sent with every page request; the user-agent mimics a
# desktop Chrome browser.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/78.0.3904.116 Safari/537.36'
    ),
}
def get_url_list(num):
    """Build the list of 51job search-result page URLs to fetch.

    Args:
        num: Number of result pages; URLs are generated for pages
            1 through ``num`` inclusive.

    Returns:
        A list of ``num`` URL strings (empty when ``num`` is less than 1).
    """
    # The query string must contain "&degreefrom=99": an HTML-entity
    # decoding of "&deg" had turned it into the literal "°reefrom=99",
    # which corrupted that parameter.
    url_template = (
        "https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,{}.html?"
        "lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&"
        "companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
    )
    return [url_template.format(page) for page in range(1, num + 1)]
def get_html(url, timeout=10):
    """Download a page and return its body decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.
            Without a timeout, ``requests`` can block forever on a
            stalled connection.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force GBK decoding instead of relying on the encoding requests guesses.
    response.encoding = 'gbk'
    return response.text
def get_content(html):
    """Extract the listing rows from one search-result page.

    Args:
        html: Page markup as already-decoded text.

    Returns:
        A list with one five-element list of stripped strings per listing:
        the link text of the row's ``<p>`` element followed by the text of
        its ``t2``..``t5`` spans (presumably job title, company, location,
        salary and posting date -- inferred from the original variable
        names, TODO confirm). Returns an empty list when the page has no
        ``dw_table`` container. Each row is also printed as it is parsed.

    Raises:
        AttributeError: If a listing row lacks one of the expected elements.
    """
    # The markup is already a str, so no from_encoding is passed:
    # BeautifulSoup ignores that argument (with a warning) for str input.
    soup = BeautifulSoup(html, "lxml")
    body = soup.body
    table = body.find("div", {"class": "dw_table"}) if body is not None else None
    if table is None:
        return []

    rows = []
    # The attribute key must be exactly "class"; a stray "}" in the key
    # made the filter match nothing. The first "el" div is skipped
    # (presumably the table's header row -- TODO confirm).
    for div in table.find_all("div", {"class": "el"})[1:]:
        row = [
            div.find("p").find("a").get_text().strip(),
            div.find("span", {"class": "t2"}).find("a").get_text().strip(),
            div.find("span", {"class": "t3"}).get_text().strip(),
            div.find("span", {"class": "t4"}).get_text().strip(),
            div.find("span", {"class": "t5"}).get_text().strip(),
        ]
        print(row)
        rows.append(row)
    return rows
def get_content_totle_list(num):
    """Fetch and parse ``num`` result pages, collecting the parsed rows.

    Args:
        num: Number of result pages to process, starting from page 1.

    Returns:
        The module-level ``data_list``, extended with whatever rows
        ``get_content`` returned for each page.
    """
    for url in get_url_list(num):
        # Text content of the fetched HTML page.
        html = get_html(url)
        rows = get_content(html)
        # Guard against get_content returning None or an empty result.
        if rows:
            data_list.extend(rows)
    return data_list
def save_to_excel():
    """Placeholder for the Excel export step; currently does nothing."""
    return None
def main(num):
    """Run the scrape over ``num`` result pages, then call the export step.

    Args:
        num: Number of search-result pages to process.
    """
    get_content_totle_list(num)
    save_to_excel()
# Start the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main(num)