python从网页获取数据失败_求助,各位大神python爬虫对网页获取内容进行提取出现错误...

该楼层疑似违规已被系统折叠 隐藏此楼查看此楼

#爬取前程无忧的数据

import requests

from bs4 import BeautifulSoup

import pandas as pd

# HTTP request headers: identify the client as a desktop Chrome browser.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/78.0.3904.116 Safari/537.36"
    ),
}

# Number of result pages to process; handed to main() at the bottom of the script.
num = 10

# Shared module-level list, initially empty.
data_list = []

# Build the list of search-result page URLs that need to be scraped

def get_url_list(num):
    """Return the search-result URLs for pages 1..num (inclusive).

    Args:
        num: Number of result pages to generate URLs for. Values below 1
            yield an empty list.

    Returns:
        A list of ``num`` URL strings, one per page, in ascending page order.
    """
    # The only varying part of the URL is the page number placed before ".html".
    url_template = (
        "https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,{}.html?"
        "lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&"
        "companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
    )
    return [url_template.format(page) for page in range(1, num + 1)]

def get_html(url, timeout=10):
    """Download *url* and return the response body decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    # requests has no default timeout; without one an unresponsive server
    # would block the script forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Override the detected charset so .text is decoded as GBK.
    response.encoding = 'gbk'
    return response.text

#对获取的内容进行提取

def _find_text(parent, name, attrs=None, child=None):
    """Return the stripped text of the first matching descendant, or "" if absent."""
    node = parent.find(name, attrs or {})
    if node is not None and child is not None:
        node = node.find(child)
    return node.get_text().strip() if node is not None else ""


def get_content(html):
    """Extract the listing rows from one search-result page.

    Looks for ``<div class="dw_table">`` inside the document body and reads
    every ``<div class="el">`` in it except the first one.

    Args:
        html: Page markup as an already-decoded ``str``.

    Returns:
        A list of rows. Each row is a list of five strings taken, in order,
        from: the link inside the first ``<p>``, the link inside
        ``span.t2``, ``span.t3``, ``span.t4`` and ``span.t5``. A missing
        cell becomes ``""``. An empty list is returned when the page has no
        body or no ``dw_table`` container.
    """
    # html is already a str, so no from_encoding is given: BeautifulSoup
    # ignores that argument (with a warning) for decoded markup.
    soup = BeautifulSoup(html, "lxml")
    body = soup.body
    if body is None:
        return []

    data_main = body.find("div", {"class": "dw_table"})
    if data_main is None:
        return []

    # The attribute name must be exactly "class"; a misspelled key matches nothing.
    divs = data_main.find_all("div", {"class": "el"})

    rows = []
    # The first "el" div is skipped -- presumably the table's header row; confirm
    # against the live page.
    for div in divs[1:]:
        # Variable names follow the original pinyin abbreviations; presumably
        # job title, company, location, salary and posting date.
        zwmc = _find_text(div, "p", child="a")
        gsmc = _find_text(div, "span", {"class": "t2"}, child="a")
        gzdd = _find_text(div, "span", {"class": "t3"})
        xz = _find_text(div, "span", {"class": "t4"})
        fbsj = _find_text(div, "span", {"class": "t5"})

        row = [zwmc, gsmc, gzdd, xz, fbsj]
        print(row)
        rows.append(row)

    return rows

def get_content_totle_list(num):
    """Fetch and parse the first *num* result pages.

    Args:
        num: Number of result pages to process.

    Returns:
        A list with the rows gathered from all pages during this call. The
        same rows are also appended to the module-level ``data_list``.
    """
    collected = []
    for url in get_url_list(num):
        # Raw HTML of one result page.
        html = get_html(url)
        rows = get_content(html)
        # Tolerate a None or empty result so one unparseable page does not
        # abort the whole run.
        if rows:
            collected.extend(rows)

    # Keep the rows in the shared list so a later step that takes no
    # arguments (such as save_to_excel) can reach them.
    data_list.extend(collected)
    return collected

def save_to_excel():
    """Placeholder for exporting the scraped data to Excel; currently does nothing."""
    # TODO: not implemented yet -- the function returns None without writing a file.

def main(num):
    """Run the scraper for *num* result pages, then call the export step.

    Args:
        num: Number of result pages to process.
    """
    get_content_totle_list(num)
    save_to_excel()

if __name__ == '__main__':
    # Start scraping only when the file is executed as a script, not on import.
    main(num)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值