Web Scraping (1): Scraping Tencent's Job Postings
1. Fetching the page data
Idea: send requests with the requests package and retrieve the data.
Studying Tencent's recruitment page shows that the listing table is loaded dynamically, so the job data can be pulled straight from its JSON API; the postId embedded in each posting's URL then leads to the detail page, which is likewise served through a JSON API.
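To get a feel for what the listing API returns, here is a minimal probe (the URL and field names are the same ones used throughout this post; pageSize is kept small purely for inspection):

import json
import requests

# Fetch a small page from the listing API and inspect its structure
url = ("https://careers.tencent.com/tencentcareer/api/post/Query"
       "?timestamp=1640351123846&countryId=&cityId=&bgIds=&productId="
       "&categoryId=&parentCategoryId=&attrId=&keyword="
       "&pageIndex=1&pageSize=3&language=zh-cn&area=cn")
data = json.loads(requests.get(url).text)
for post in data['Data']['Posts']:
    # PostURL carries the postId that the detail API needs
    print(post['RecruitPostName'], post['PostURL'])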
The Python code breaks down into a few parts:
Because the data is destined for a database, it is first downloaded by CategoryId, which means splicing each CategoryId into the URL to fetch the categories one at a time.
import json
import re

import pymysql
import requests

# Download postings grouped by category
def category_dis():
    # Base listing URL (no category filter):
    # url = "https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1640351123846&countryId=&cityId=&bgIds" \
    #       "=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex=1&pageSize=10&language=zh-cn&area=cn"
    list_categoryId = [40005001, 40005002, 40006, 40007, 40008, 40009, 40010, 40011]
    for i in list_categoryId:
        url = "https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1640351123846&countryId=&cityId=&bgIds" \
              "=&productId=&categoryId={0}&parentCategoryId=&attrId=&keyword=&pageIndex=1&pageSize=5000&language=zh-cn&area=cn".format(i)
        download(url, str(i))
        print(f"Finished storing category {i}")
Next comes the download code itself: fetching the listing data and writing it into the database.
# Download one category's data via its listing URL
def download(url, categoryid):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0"}
    # Example listing URL:
    # https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1640160325881&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex=1&pageSize=9500&language=zh-cn&area=cn
    print(url)
    html = requests.get(url, headers=headers).text
    # Parse the JSON payload
    data = json.loads(html)
    data_list = data['Data']['Posts']
    # # Alternative: save the data to an Excel sheet instead of the database
    # wb = openpyxl.Workbook()  # create a workbook
    # ws = wb.active  # activate the worksheet
    # # Write the header row cell by cell
    # ws['A1'] = 'Post name'
    # ws['B1'] = 'Country'
    # ws['C1'] = 'City'
    # ws['D1'] = 'BG name'
    # ws['E1'] = 'Category'
    # ws['F1'] = 'Responsibility'
    # ws['G1'] = 'Publish time'
    # ws['H1'] = 'Detail URL'
    # for i in data_list:
    #     # Append one row per posting
    #     ws.append([i['RecruitPostName'], i['CountryName'], i['LocationName'], i['BGName'], i['CategoryName'], i['Responsibility'], i['LastUpdateTime'], i['PostURL']])
    # # Save the workbook
    # wb.save('tencentjobdesc.xlsx')
    # print("Saved")
    db, cursor = cur()
    total = 0
    for i in data_list:
        CategoryId = categoryid
        RecruitPostName = i['RecruitPostName']
        CountryName = i['CountryName']
        LocationName = i['LocationName']
        BGName = i['BGName']
        CategoryName = i['CategoryName']
        Responsibility_new = i['Responsibility']
        LastUpdateTime = i['LastUpdateTime']
        # Fetch the detail page once and reuse the result for both fields
        detail = get_information(i["PostURL"], headers=headers)
        if detail is not False:
            Responsibility, Requirement = detail
            num = insert(cursor, RecruitPostName, CountryName, LocationName, BGName, CategoryName,
                         Responsibility_new, LastUpdateTime, Responsibility, Requirement, CategoryId)
            total = num + total
    print(total)
    if total > 0:
        print("Rows added")
        db.commit()
    else:
        print("Nothing inserted")
        db.rollback()
    cursor.close()
    db.close()
Each category is treated as one transaction: commit on success, roll back otherwise.
(Note the condition is loose: a single successfully inserted row is enough to count as success and commit.)
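The same one-transaction-per-category idea can be written a bit more defensively with try/except/finally, so that any database error rolls the whole category back and the connection is always closed. A sketch reusing the cur(), get_information(), and insert() helpers from this post (store_category itself is a hypothetical variant, not code from the original):

# One category == one transaction: commit every row or none of them
def store_category(posts, categoryid, headers):
    db, cursor = cur()
    try:
        for post in posts:
            detail = get_information(post["PostURL"], headers=headers)
            if detail is not False:
                insert(cursor, post['RecruitPostName'], post['CountryName'],
                       post['LocationName'], post['BGName'], post['CategoryName'],
                       post['Responsibility'], post['LastUpdateTime'],
                       detail[0], detail[1], categoryid)
        db.commit()
    except pymysql.MySQLError:
        # Any database error undoes the whole category
        db.rollback()
        raise
    finally:
        cursor.close()
        db.close()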
Next is downloading the detail page. Inspecting the detail API shows that only two fields are needed; the names below follow the API's own field naming.
# Fetch the detail information for one posting
def get_information(url, headers):
    # Pull the postId out of the posting's PostURL
    text = re.findall(r'postId=(.*)', url)
    if text:
        print(text)
        if text[0] != '0':
            url = "https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1640351123846&postId={0}&language=zh-cn".format(text[0])
            print(url)
            html = requests.get(url, headers=headers).text
            data = json.loads(html)
            Responsibility = data['Data']['Responsibility']
            Requirement = data['Data']['Requirement']
            return Responsibility, Requirement
        else:
            # postId 0 has no detail page; return empty fields
            Responsibility = ""
            Requirement = ""
            return Responsibility, Requirement
    else:
        return False
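A quick usage check (the postId value here is illustrative; only the postId query parameter in PostURL matters to the regex above):

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0"}
result = get_information("https://careers.tencent.com/jobdesc.html?postId=1", headers=headers)
if result is not False:
    responsibility, requirement = result
    print(responsibility[:80], requirement[:80])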
After that comes the database insert.
# Insert one row into the tencentjob table
def insert(cursor, RecruitPostName, CountryName, LocationName, BGName, CategoryName, Responsibility_new, LastUpdateTime,
           Responsibility, Requirement, CategoryId):
    # Earlier version, built with str.format(); it needed single quotes in the
    # data doubled to keep the SQL valid:
    # sql = "insert into tencentjob(RecruitPostName,CountryName,LocationName,BGName,CategoryName,Responsibility_new,LastUpdateTime,Responsibility, Requirement) values ('{0}','{1}','{2}','{3}','{4}','{5}','{6}','{7}','{8}')".format(
    #     RecruitPostName, CountryName, LocationName, BGName, CategoryName, Responsibility_new, LastUpdateTime,
    #     Responsibility, Requirement)
    # With the parameterized query below, pymysql escapes the values itself,
    # so a strip() is all that is left to do.
    Requirement = str(Requirement).strip()
    Responsibility = str(Responsibility).strip()
    Responsibility_new = str(Responsibility_new).strip()
    sql = "insert into tencentjob(RecruitPostName,CountryName,LocationName,BGName,CategoryName,Responsibility_new,LastUpdateTime,Responsibility,Requirement,CategoryId) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    i = cursor.execute(sql, (
        RecruitPostName, CountryName, LocationName, BGName, CategoryName, Responsibility_new, LastUpdateTime,
        Responsibility, Requirement, CategoryId))
    return i
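cursor.execute() returns the number of affected rows, which is what download() accumulates. When loading thousands of postings, executemany() can cut down the round trips; a sketch using the same column order (insert_many is a hypothetical helper, not part of the original code):

# Batch insert: rows is a list of 10-tuples in the same order as insert()
def insert_many(cursor, rows):
    sql = ("insert into tencentjob(RecruitPostName,CountryName,LocationName,"
           "BGName,CategoryName,Responsibility_new,LastUpdateTime,"
           "Responsibility,Requirement,CategoryId) "
           "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    return cursor.executemany(sql, rows)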
That wraps up the scraping part.
On the database side, a small helper function wraps the connection setup:
# Open the database connection and hand back both it and a cursor
def cur():
    db = pymysql.connect(host="localhost", user="username", password="password", database="dbname")
    cursor = db.cursor()
    return db, cursor
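For reference, the insert statement above implies a table roughly like the one below. The post never shows the actual DDL, so the column types here are an assumption:

# Assumed schema for tencentjob (the original post does not include the
# CREATE TABLE statement, so the types are a guess)
CREATE_TABLE = """
CREATE TABLE IF NOT EXISTS tencentjob (
    id INT AUTO_INCREMENT PRIMARY KEY,
    RecruitPostName VARCHAR(255),
    CountryName VARCHAR(64),
    LocationName VARCHAR(64),
    BGName VARCHAR(64),
    CategoryName VARCHAR(64),
    Responsibility_new TEXT,
    LastUpdateTime VARCHAR(32),
    Responsibility TEXT,
    Requirement TEXT,
    CategoryId VARCHAR(16)
)
"""

db, cursor = cur()
cursor.execute(CREATE_TABLE)
db.commit()
db.close()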