import json
import os

import requests
from bs4 import BeautifulSoup
# 1. Fetch the page content
def get_content(page):
    """Fetch one results page of Tencent HR 'python' job listings.

    Args:
        page: zero-based page index; the site paginates 10 postings per page,
            so the request offset is ``page * 10``.

    Returns:
        The raw HTML of the listing page as a string.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within 10 seconds.
    """
    base_url = 'https://hr.tencent.com/position.php?'
    params = {
        'keywords': 'python',
        'tid': 0,
        'lid': 2196,
        'start': page * 10,  # result offset: 10 rows per page
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'}
    # timeout prevents an unresponsive server from hanging the scraper forever;
    # raise_for_status fails fast instead of handing an error page to the parser.
    response = requests.get(url=base_url, params=params, headers=headers, timeout=10)
    response.raise_for_status()
    return response.text
# 2. Extract the data
def get_data(soup):
    """Parse the job postings out of one listing page's HTML.

    Args:
        soup: raw HTML text of a listing page (as returned by get_content).

    Returns:
        A JSON array string; each element describes one posting with keys
        ``name``, ``href``, ``catalog``, ``num``, ``workLocation`` and
        ``publishTime``.
    """
    bs_soup = BeautifulSoup(soup, 'lxml')
    # The first <tr> is the table header and the last two are footer /
    # pagination rows — slice them away instead of three separate pop()s.
    rows = bs_soup.select('tr')[1:-2]
    item_list = []
    for tr in rows:
        td = tr.select('td')
        item_list.append({
            'name': td[0].string,                                       # job title
            'href': 'https://hr.tencent.com/' + td[0].a.attrs['href'],  # detail link
            'catalog': td[1].string,                                    # job category
            'num': td[2].string,                                        # headcount
            'workLocation': td[3].string,
            'publishTime': td[4].string,
        })
    # ensure_ascii=False keeps Chinese text human-readable in the output file.
    return json.dumps(item_list, ensure_ascii=False)
if __name__ == '__main__':
    # Original line was `if name==‘main’:` — missing dunder underscores and
    # smart quotes made it a SyntaxError; corrected to the standard guard.
    # Scrape the first 5 result pages and append each page's JSON array as
    # one line of ./dir/tencent.txt.
    os.makedirs('./dir', exist_ok=True)  # 'a' mode cannot create the directory itself
    for page in range(5):
        html = get_content(page)
        data = get_data(html)
        with open('./dir/tencent.txt', 'a', encoding='utf-8') as fp:
            fp.write(data)
            fp.write('\n')