使用beautifulsoup方法抓取51job网页数据

最新推荐文章于 2024-07-09 14:45:02 发布

pennyyangpei

最新推荐文章于 2024-07-09 14:45:02 发布

阅读量1.2k

点赞数 1

本文链接：https://blog.csdn.net/qq_42379006/article/details/80622681

版权

# -*- coding:utf-8 -*-
#使用beautifulsoup方法抓取网页数据
import chardet
from bs4 import BeautifulSoup
import requests

# 51job search-result URL; the path segment contains the keyword "python".
url = 'https://search.51job.com/list/020000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='

# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url, timeout=10)
# Fail early with a clear HTTP error instead of trying to parse an error page.
r.raise_for_status()

# Let chardet guess the encoding from the raw bytes so that r.text is decoded
# with the detected charset rather than the one requests would assume.
code = chardet.detect(r.content)['encoding']
r.encoding = code
soup = BeautifulSoup(r.text, 'html.parser')

# Locate the container element first; every result row is searched inside it.
parentDiv = soup.find('div', attrs={'id': 'resultList'})
if parentDiv is None:
    raise RuntimeError(
        "could not find <div id='resultList'> in the response; "
        "the page layout may have changed or the request was blocked"
    )

divs = parentDiv.find_all('div', attrs={'class': 'el'})
# Discard the first matched row (presumably the table header -- confirm against
# the page markup). Slice deletion is a no-op on an empty result, whereas
# pop(0) would raise IndexError.
del divs[:1]


def _find_in(tag, *args, **kwargs):
    """Return ``tag.find(*args, **kwargs)``, or None when *tag* is None."""
    return tag.find(*args, **kwargs) if tag is not None else None


def _tag_string(tag):
    """Return ``tag.string``, or None when *tag* is None."""
    return tag.string if tag is not None else None


# One list per result row: [title attribute of the <p> link, text of the link
# inside span.t2, text of span.t3, text of span.t4, text of span.t5].
# A field whose tag is missing is stored as None so that a single malformed
# row does not abort the whole scrape.
dataList = []
for each in divs:
    title_link = _find_in(each.find('p'), 'a')
    # .get() returns None (rather than raising) when the attribute is absent.
    e1 = title_link.get('title') if title_link is not None else None
    e2 = _tag_string(_find_in(each.find('span', attrs={'class': 't2'}), 'a'))
    e3 = _tag_string(each.find('span', attrs={'class': 't3'}))
    e4 = _tag_string(each.find('span', attrs={'class': 't4'}))
    e5 = _tag_string(each.find('span', attrs={'class': 't5'}))
    dataList.append([e1, e2, e3, e4, e5])

print(dataList)
import xlwt
# Dump the scraped rows into a single-sheet .xls workbook: one spreadsheet row
# per entry of dataList, one column per field of that entry.
wbk = xlwt.Workbook()
sheet = wbk.add_sheet('python')

# Flatten the nested list into (row, column, value) triples, in the same
# row-major order a nested loop would visit them.
cells = (
    (row_idx, col_idx, cell)
    for row_idx, record in enumerate(dataList)
    for col_idx, cell in enumerate(record)
)
for row_idx, col_idx, cell in cells:
    sheet.write(row_idx, col_idx, cell)

wbk.save('python.xls')