分析过程与链家是一样的。
腾讯社招完整代码如下:
import requests
from lxml import etree
from mysql_class import Mysql # 自己封装好的Mysql类
def txshezhao(keywords, page):
    """Scrape Tencent HR job postings and store them in MySQL.

    :param keywords: search keyword used to filter the job listings
    :param page: highest page index to fetch (pages 0..page inclusive,
                 10 postings per page)
    :return: None; each posting is inserted into the ``tengxun`` table via
             the module-level ``Insert`` / ``sql`` globals set up in __main__
    """
    # Request headers never change between pages, so build them once.
    headers = {
        'Cookie': '_ga=GA1.2.552710032.1529846866; pgv_pvi=5319122944; PHPSESSID=a7let8q1aup7j9p40mubjq8h64; pgv_si=s6819970048',
        'Host': 'hr.tencent.com',
        'Referer': 'https://hr.tencent.com/position.php?keywords=%E8%AF%B7%E8%BE%93%E5%85%A5%E5%85%B3%E9%94%AE%E8%AF%8D&lid=2156&tid=87',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    count = 0
    while count <= page:
        # `start` is the result offset: 10 postings per listing page.
        url = 'https://hr.tencent.com/position.php?keywords={}&lid=2156&tid=87&start={}#a'.format(keywords, count * 10)
        count += 1
        res = requests.get(url, headers=headers)
        html = etree.HTML(res.text)
        # Table rows 2..11 hold the links to the (up to) 10 postings.
        for every in range(2, 12):
            res_href = html.xpath('//table[@class="tablelist"]/tr[{}]/td[1]/a/@href'.format(every))
            if not res_href:
                # Fewer than 10 postings on this page (e.g. the last page).
                continue
            href = 'https://hr.tencent.com/' + res_href[0]
            # Use a distinct name so the listing response is not shadowed.
            detail = requests.get(href, headers=headers)
            html1 = etree.HTML(detail.text)
            info1 = html1.xpath('//td[@id="sharetitle"]//text()')
            if not info1:
                continue  # detail page layout changed or request was blocked
            job_name = str(info1[0])
            # Example result:
            # ['工作地点:', '北京', '职位类别:', '技术类', '招聘人数:', '1人']
            res_msg = html1.xpath('//tr[@class ="c bottomline"]/td//text()')
            if len(res_msg) < 6:
                continue  # incomplete metadata row; skip rather than crash
            address = str(res_msg[1])
            category = str(res_msg[3])
            number = str(res_msg[5])
            # Join all text fragments of the job-description bullet list
            # (''.join avoids the quadratic += string build).
            information_list = html1.xpath('//table[@class="tablelist textl"]/tr[4]/td/ul//text()')
            information = ''.join(str(part) for part in information_list)
            data = (job_name, address, category, number, information)
            # `Insert` and `sql` are module-level globals from __main__.
            Insert.mysql_op(sql, data)
if __name__ == '__main__':
    # Database helper consumed by txshezhao() through module-level globals.
    Insert = Mysql()
    # Parameterized INSERT executed once per scraped posting.
    sql = '''INSERT INTO tengxun (job_name, address, category, number, information) VALUES(%s, %s, %s, %s, %s)'''
    # Prompt for the search keyword, then fetch pages 0..5.
    print('请在下面输入关键字进行爬取数据:')
    keywords = input()
    txshezhao(keywords, 5)
拉勾网完整代码如下:
import requests
from lxml import etree
import pymysql
class Mysql(object):
    """Thin wrapper around a pymysql connection for executing write queries."""

    def __init__(self):
        """Open the database connection and create a cursor."""
        self.db = pymysql.connect(host="localhost", user="root", password="8888", database="test")
        self.cursor = self.db.cursor()

    def mysql_op(self, sql, data):
        """Execute a parameterized SQL statement and commit immediately.

        :param sql: SQL string containing %s placeholders
        :param data: sequence of values bound to the placeholders
        """
        self.cursor.execute(sql, data)
        self.db.commit()

    def __del__(self):
        """Close the cursor and the connection.

        Guarded with getattr: if __init__ raised before the attributes were
        assigned (e.g. the connection failed), __del__ is still invoked and
        must not raise AttributeError.
        """
        cursor = getattr(self, 'cursor', None)
        if cursor is not None:
            cursor.close()
        db = getattr(self, 'db', None)
        if db is not None:
            db.close()
# ---- Lagou scraper: module-level script ----
# Database helper; rows are inserted through the parameterized statement below.
Insert = Mysql()
# Parameterized INSERT executed once per scraped posting.
sql = '''INSERT INTO lagou (company, job_name, salary, adress, jingyan, school,job_des) VALUES(%s, %s, %s, %s, %s, %s, %s)'''
# Ajax endpoint that returns one page of search results as JSON (city=北京).
url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false'
# Headers copied from a real browser session; the Cookie and the anti-forge
# fields are required or Lagou serves an anti-scraping page instead of data.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'JSESSIONID=ABAAABAAAGFABEF780FE198208BF21A58749B6B7C26C915; _ga=GA1.2.1321423683.1534510673; _gid=GA1.2.581729554.1534510673; user_trace_token=20180817205757-29e3715f-a21d-11e8-a9f0-5254005c3644; LGUID=20180817205757-29e375b6-a21d-11e8-a9f0-5254005c3644; index_location_city=%E5%8C%97%E4%BA%AC; TG-TRACK-CODE=search_code; X_HTTP_TOKEN=87d99de12746e518d50f2fe7fede59a0; PRE_UTM=; LGSID=20180818000633-829355a0-a237-11e8-a9f0-5254005c3644; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D33WZv6WWqh6LDiUr0dWxB6F4E9letiquzVMR10EQdIG%26wd%3D%26eqid%3Dd381bb6900049a8a000000035b76c64a; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1534510675,1534521990,1534522180; LGRID=20180818001050-1bed8a51-a238-11e8-91ae-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1534522248; SEARCH_ID=444ab1d908b04a32b195b1ac433ef583',
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest',
}
for page in range(1, 30):
    form = {
        'first': 'false',
        'pn': page,
        'kd': '数据分析'
    }
    response = requests.post(url, headers=headers, data=form)
    html = response.json()
    results = html["content"]["positionResult"]["result"]
    # At most 15 postings per page; min() guards against a short last page
    # that would otherwise raise IndexError.
    for url0 in range(min(15, len(results))):
        position_id = results[url0]["positionId"]  # e.g. 4605300 (int)
        url1 = 'https://www.lagou.com/jobs/' + str(position_id) + '.html'
        res = requests.get(url1, headers=headers)
        res_element = etree.HTML(res.text)
        # An empty match means an anti-scraping page was served; give up on
        # the remaining postings of this page.
        if res_element.xpath('//div[@class="job-name"]/div[1]') == []:
            break
        company = res_element.xpath('//div[@class="job-name"]/div[1]')[0].text
        job_name = res_element.xpath('//div[@class="job-name"]/span')[0].text
        salary = res_element.xpath('//dd[@class="job_request"]/p/span[1]')[0].text
        adress = res_element.xpath('//dd[@class="job_request"]/p/span[2]')[0].text
        jingyan = res_element.xpath('//dd[@class="job_request"]/p/span[3]')[0].text
        school = res_element.xpath('//dd[@class="job_request"]/p/span[4]')[0].text
        des_msg = res_element.xpath('//dd[@class="job_bt"]/div//text()')
        # Concatenate all description fragments; ''.join avoids the
        # quadratic += string build.
        job_des = ''.join(str(part).strip('\n') for part in des_msg)
        print(job_des)
        data = (str(company), str(job_name), str(salary), str(adress).strip('/'), str(jingyan).strip('/'), str(school).strip('/'), str(job_des))
        Insert.mysql_op(sql, data)
链家、拉勾、Boss 等网站可以拿来学习练手,请不要过多地爬取数据。