通过python爬虫,爬取新华三的招聘信息
1. 导包
导入http工具类和lxml解析器
import requests
from lxml import html
2. 获取每个岗位的信息
# Scrape all 49 listing pages of job postings.
# range's upper bound is exclusive, so range(1, 50) covers pages 1-49
# (the original range(1, 49) silently dropped the last page).
for i in range(1, 50):
    # Listing-page URL for page i
    url = "https://h3c.zhiye.com/search/?PageIndex={}".format(i)
    # timeout prevents the crawl from hanging forever on a stalled request
    te = requests.get(url, timeout=10).text
    soup = html.fromstring(te)
    # Locate the data via the listing table's HTML structure:
    # job titles (anchor text in the table cells)
    tr = soup.xpath("//table[@class='listtable']/tr/td/a/text()")
    # detail-page URLs (relative paths) for each job
    href = soup.xpath("//table[@class='listtable']/tr/td/a/@href")
    # remaining columns of the listing table (elements for td[2]/td[3], text for td[4])
    s = soup.xpath("//table[@class='listtable']/tr/td[2]")
    r = soup.xpath("//table[@class='listtable']/tr/td[3]")
    g = soup.xpath("//table[@class='listtable']/tr/td[4]/text()")
    for t in range(len(tr)):
        # Print one job's fields, stripping newline/space noise from the markup
        print(tr[t].replace("\n", "").replace(" ", ""))
        print(s[t].text)
        print(r[t].text.replace("\n", "").replace(" ", ""))
        print(g[t].replace("\n", "").replace(" ", ""))
3. 获取每个岗位的详情信息
通过上面第2步循环中提取到的详情url(href),解析并打印岗位详情数据
def get_utls(url1):
    """Fetch one job's detail page and print its attribute values.

    url1: relative detail-page path (e.g. '/job/...') taken from the
          listing table's anchor @href; joined onto the site root here.
    Prints the resolved URL followed by up to six cleaned attribute values.
    """
    url1 = 'https://h3c.zhiye.com' + url1
    print(url1)
    # timeout prevents a stalled detail request from hanging the whole crawl
    te1 = requests.get(url1, timeout=10).text
    soup1 = html.fromstring(te1)
    a = soup1.xpath("//li[@class='nvalue']/text()")
    # Print up to the first six attribute values. Slicing guards against
    # sparse pages (the original indexed a[0]..a[5] and raised IndexError
    # when fewer than six values were present), and the non-breaking-space
    # cleanup is applied uniformly (the original only stripped it from a[5]).
    for value in a[:6]:
        print(value.replace("\n", "").replace(" ", "").replace("\xa0", ""))
4. 完整代码
import requests
from lxml import html
#xpath
def get_utls(url1):
    """Fetch one job's detail page and print its attribute values.

    url1: relative detail-page path (e.g. '/job/...') taken from the
          listing table's anchor @href; joined onto the site root here.
    Prints the resolved URL followed by up to six cleaned attribute values.
    """
    url1 = 'https://h3c.zhiye.com' + url1
    print(url1)
    # timeout prevents a stalled detail request from hanging the whole crawl
    te1 = requests.get(url1, timeout=10).text
    soup1 = html.fromstring(te1)
    a = soup1.xpath("//li[@class='nvalue']/text()")
    # Print up to the first six attribute values. Slicing guards against
    # sparse pages (the original indexed a[0]..a[5] and raised IndexError
    # when fewer than six values were present), and the non-breaking-space
    # cleanup is applied uniformly (the original only stripped it from a[5]).
    for value in a[:6]:
        print(value.replace("\n", "").replace(" ", "").replace("\xa0", ""))
# Crawl every listing page and, for each job row, print its summary fields
# and then fetch/print the detail page via get_utls.
# range's upper bound is exclusive, so range(1, 50) covers pages 1-49
# (the original range(1, 49) silently dropped the last page).
for i in range(1, 50):
    url = "https://h3c.zhiye.com/search/?PageIndex={}".format(i)
    # timeout prevents the crawl from hanging forever on a stalled request
    te = requests.get(url, timeout=10).text
    soup = html.fromstring(te)
    # Columns of the listing table: titles, detail links, and three data columns
    tr = soup.xpath("//table[@class='listtable']/tr/td/a/text()")
    href = soup.xpath("//table[@class='listtable']/tr/td/a/@href")
    s = soup.xpath("//table[@class='listtable']/tr/td[2]")
    r = soup.xpath("//table[@class='listtable']/tr/td[3]")
    g = soup.xpath("//table[@class='listtable']/tr/td[4]/text()")
    # zip() stops at the shortest list, so a row with a missing column can no
    # longer raise IndexError (the original indexed all five lists by position).
    for title, link, s_td, r_td, g_text in zip(tr, href, s, r, g):
        print(title.replace("\n", "").replace(" ", ""))
        print(s_td.text)
        # .text is None for an empty cell; guard before cleaning
        print((r_td.text or "").replace("\n", "").replace(" ", ""))
        print(g_text.replace("\n", "").replace(" ", ""))
        get_utls(link)