Tencent Recruitment
Related code implementation:
import requests
from lxml import etree

Base_domain = "https://hr.tencent.com/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}


def get_detail_urls(url):
    # Fetch one listing page and collect the link of every job posting on it.
    response = requests.get(url, headers=headers)
    text = response.text
    html = etree.HTML(text)
    detail_urls = html.xpath("//div[@id='position']/div[1]/table/tr/td/a/@href")
    # The hrefs are relative, so prepend the site domain to get full URLs.
    detail_urls = map(lambda url: Base_domain + url, detail_urls)
    return detail_urls


def get_detail_page(url):
    # Fetch a single job-detail page and extract its fields into a dict.
    response = requests.get(url, headers=headers)
    text = response.content.decode()
    html = etree.HTML(text)
    item = {}
    item['name'] = html.xpath("//table[@class='tablelist textl']//tr[@class='h']/td[@id='sharetitle']/text()")
    item['area'] = html.xpath("//table[@class='tablelist textl']//tr[@class='c bottomline']/td[1]/text()")
    item['num'] = html.xpath("//table[@class='tablelist textl']//tr[@class='c bottomline']/td[3]/text()")
    item['assig'] = html.xpath("//table[@class='tablelist textl']//tr[@class='c'][1]/td[@class='l2']/ul/li/text()")
    item['require'] = html.xpath("//table[@class='tablelist textl']//tr[@class='c'][2]/td[@class='l2']//li/text()")
    return item


def spider():
    url_list = "https://hr.tencent.com/position.php?&start={}#a"
    # Crawl the first pages of listings (start offsets 0, 10, ..., 70).
    items = []
    for x in range(0, 8):
        urls = url_list.format(x * 10)
        print(urls)
        detail_urls = get_detail_urls(urls)
        for detail_url in detail_urls:
            item = get_detail_page(detail_url)
            items.append(item)
            print(items)


if __name__ == '__main__':
    spider()
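
One detail about the extraction above: html.xpath() always returns a list, so every field stored in item is a list of strings rather than a single string. The sketch below shows one possible way to flatten those lists; first_or_empty and normalize are hypothetical helpers added here for illustration, not part of the original script.

def first_or_empty(values):
    # xpath() returns a list; take the first match, stripped, or '' if nothing matched.
    return values[0].strip() if values else ''


def normalize(item):
    # 'name', 'area' and 'num' are assumed to be single-valued on the detail page;
    # the duty ('assig') and requirement ('require') fields keep one entry per bullet line.
    return {
        'name': first_or_empty(item.get('name', [])),
        'area': first_or_empty(item.get('area', [])),
        'num': first_or_empty(item.get('num', [])),
        'assig': [s.strip() for s in item.get('assig', [])],
        'require': [s.strip() for s in item.get('require', [])],
    }

Calling normalize(get_detail_page(detail_url)) inside spider() would then give dicts whose scalar fields are plain strings, which is usually easier to print or save.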
Notes:
- 1. After finishing each function, print() its result to check whether the xpath extraction is wrong (a minimal sketch of this kind of check follows these notes). I have been bitten by this many times and spent a long time debugging before realizing the error was here.
- 2. Get comfortable with xpath and regularly use the XPath Helper extension to verify your xpath expressions; over time, xpath will no longer be much of a problem.
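
As a concrete version of note 1, here is a minimal sketch of that debugging habit: it runs the same xpath used in get_detail_urls() against a small hard-coded HTML fragment and prints the result, so an empty list immediately tells you the expression does not match the markup. The sample_html fragment is an assumption for illustration only; the real markup at hr.tencent.com may differ or have changed.

from lxml import etree

# Tiny stand-in for the listing page (assumed structure, for illustration only).
sample_html = """
<div id="position">
  <div>
    <table>
      <tr><td><a href="position_detail.php?id=1">Backend Engineer</a></td></tr>
      <tr><td><a href="position_detail.php?id=2">Data Analyst</a></td></tr>
    </table>
  </div>
</div>
"""

html = etree.HTML(sample_html)
# Same expression as in get_detail_urls(); print it right away to verify the match.
hrefs = html.xpath("//div[@id='position']/div[1]/table/tr/td/a/@href")
print(hrefs)  # expected: ['position_detail.php?id=1', 'position_detail.php?id=2']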