# -*- coding:utf-8 -*-
#使用beautifulsoup方法抓取网页数据
import chardet
from bs4 import BeautifulSoup
import requests
def _span_string(row, css_class, inside_link=False):
    """Return the ``.string`` of ``<span class=css_class>`` found in *row*.

    When *inside_link* is true the value is read from the ``<a>`` nested in
    that span instead of from the span itself. Returns ``None`` when the span
    (or the nested link) is missing, so a single malformed row cannot abort
    the whole scrape.
    """
    node = row.find('span', attrs={'class': css_class})
    if node is not None and inside_link:
        node = node.find('a')
    return node.string if node is not None else None


# 51job search listing URL (the path carries the search term "python").
# NOTE: the query string must contain "&degreefrom=99". An HTML-entity decoder
# had turned "&deg" into the degree sign, which corrupted that parameter.
url = 'https://search.51job.com/list/020000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='

# Without a timeout requests waits indefinitely on an unresponsive server.
r = requests.get(url, timeout=10)
# Fail early on HTTP errors instead of parsing an error page.
r.raise_for_status()

# Detect the encoding from the raw bytes and use it when decoding r.text.
code = chardet.detect(r.content)['encoding']
r.encoding = code
soup = BeautifulSoup(r.text, 'html.parser')

# Find the parent container by its id first; every result row lives inside it.
parentDiv = soup.find('div', attrs={'id': 'resultList'})
if parentDiv is None:
    raise RuntimeError(
        "could not find <div id='resultList'> in the response; "
        "the page layout may have changed or the request was blocked"
    )

# The first 'el' div is skipped, exactly as the original pop(0) did
# (presumably it is the table header row -- confirm against the live page).
# Slicing, unlike pop(0), does not raise when no rows were found.
divs = parentDiv.find_all('div', attrs={'class': 'el'})[1:]

dataList = []
for each in divs:
    # First column: the 'title' attribute of the link inside the row's <p>.
    # .get('title') yields None instead of raising when the attribute is absent.
    paragraph = each.find('p')
    link = paragraph.find('a') if paragraph is not None else None
    e1 = link.get('title') if link is not None else None

    # Remaining columns come from the spans with classes t2..t5; the t2 value
    # is wrapped in a link, the others are the span's own string.
    e2 = _span_string(each, 't2', inside_link=True)
    e3 = _span_string(each, 't3')
    e4 = _span_string(each, 't4')
    e5 = _span_string(each, 't5')

    dataList.append([e1, e2, e3, e4, e5])
print(dataList)

import xlwt

# Dump the scraped rows into a single-sheet workbook, one row per list entry.
wbk = xlwt.Workbook()
sheet = wbk.add_sheet('python')
for i, each in enumerate(dataList):
    for j, value in enumerate(each):
        sheet.write(i, j, value)
wbk.save('python.xls')
# 使用beautifulsoup方法抓取51job网页数据
# 最新推荐文章于 2024-07-09 14:45:02 发布