Getting Started with Python Web Scraping (Part 2): Elegantly Scraping H3C Job Postings with BeautifulSoup
- Initialize the output file
Create the class and initialize its parameters: the save path and a file handle opened for appending.
class My(object):
    def __init__(self):
        path = "D:/shuju"
        file = "XH3.json"
        os.makedirs(path, exist_ok=True)  # make sure the target directory exists
        self.file_path = os.path.join(path, file)
        self.fp = open(self.file_path, "a", encoding="utf-8")
- Rotate the request headers to avoid an IP ban
Change the User-Agent header value on every request to mimic a real browser. (A retry-enabled variant is sketched after the code below.)
def get_soup(self, url):
    # pool of User-Agent strings to rotate through (note the trailing commas --
    # without them Python silently concatenates adjacent string literals)
    user_agents = [
        "Mozilla/5.0 (Windows NT 6.1; Win64; rv:27.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:10.0) Gecko/20100101 Firefox/10.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/21.0.1180.110 Safari/537.36",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686 rv:10.0) Gecko/20100101 Firefox/27.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/34.0.1838.2 Safari/537.36",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686 rv:27.0) Gecko/20100101 Firefox/27.0",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
    ]
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': random.choice(user_agents)  # rotate the UA on every request
    }
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.content.decode("utf-8"), "html5lib")
    return soup
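
If a request occasionally times out or drops, a thin retry wrapper keeps the crawl alive. This is a minimal sketch of my own (the retry count and backoff are arbitrary choices, not part of the original code):

def get_soup_with_retry(self, url, retries=3):
    # retry transient network failures with a short exponential backoff
    for attempt in range(retries):
        try:
            return self.get_soup(url)
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # out of attempts, let the caller see the error
            time.sleep(2 ** attempt)  # wait 1 s, then 2 s, then 4 s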
- Get the page count
Method 1: extract the total number of pages with a regular expression.
def get_page(self, url):
    soup = self.get_soup(url)
    a = soup.select(".tablefooter span")[0].text
    page = re.findall(r"当前第\d+/(\d+)页", a)  # e.g. "当前第1/12页" -> "12"
    return int(page[0])
Method 2: pick the page count out of the pager links with a bs4 selector.
def get_page2(self, url):
    soup = self.get_soup(url)
    a = soup.select(".tablefooter span a")[3]  # the "last page" link
    # pull the PageIndex parameter out of the link's query string
    page = parse_qs(urlparse(a['href']).query)['PageIndex'][0]
    return int(page)  # crawl() expects an int, not a string
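
To see what method 2 relies on, here is parse_qs pulling a parameter out of a pager link (the href below is made up for illustration):

from urllib.parse import urlparse, parse_qs

href = "/search/?p=1%5E-1%2C3%5E-1&PageIndex=12"   # hypothetical pager link
query = urlparse(href).query                       # "p=1%5E-1%2C3%5E-1&PageIndex=12"
print(parse_qs(query)['PageIndex'])                # ['12']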
- Get the job details
def get_urls(self, item, url):
    url = urljoin("https://h3c.zhiye.com/", url)  # build an absolute detail-page URL
    print(url)
    soup = self.get_soup(url)
    nwe = soup.select(".nvalue")
    item["招聘类别"] = nwe[0].text.replace("\n", "").replace(" ", "")
    item["工作性质"] = nwe[1].text.replace("\n", "").replace(" ", "")
    item["薪资范围"] = nwe[2].text.replace("\n", "").replace(" ", "")
    # NOTE: the original reused the "招聘类别" key here, silently overwriting
    # nwe[0]; rename it to whatever label the page actually shows for this field
    item["招聘类别"] = nwe[3].text.replace("\n", "").replace(" ", "")
    item["发布时间"] = nwe[4].text.replace("\n", "").replace(" ", "")
    item["截止时间"] = nwe[5].text.replace("\n", "").replace(" ", "")
    new = soup.select(".xiangqingtext p")
    item["工作地点"] = new[1].text.replace("\n", "").replace(" ", "")
    item["工作职责"] = new[3].text.replace("\n", "").replace(" ", "").replace("\t", "")
    item["任职资格"] = new[5].text.replace("\n", "").replace(" ", "").replace("\t", "")
- Get the job listings
Parse the listing table on each page URL, sleeping a random interval between requests (see the jitter note after the code).
def crawl(self, page):
    for i in range(1, page + 1):
        url = "https://h3c.zhiye.com/search/?p=1%5E-1%2C3%5E-1&PageIndex={}".format(i)
        soups = self.get_soup(url)  # reuse the header-rotating fetcher
        tr = soups.select(".listtable tbody tr")
        delays = [1, 2, 3]
        for t in tr:
            item = {}
            tds = t.find_all("td")  # look the row's cells up once, not four times
            name = tds[0].text.replace("\n", "").replace(" ", "")
            lie = tds[1].text.replace("\n", "").replace(" ", "")
            loc = tds[2].text.replace("\n", "").replace(" ", "")
            newtime = tds[3].text.replace("\n", "").replace(" ", "")
            urls = tds[0].a['href']
            self.get_urls(item, urls)
            time.sleep(random.choice(delays))  # pause 1-3 s between detail pages
            item["name"] = name
            item["lie"] = lie
            item["loc"] = loc
            item["time"] = newtime
            print('Saving record %s' % item)
            self.save(item)
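
The 1/2/3-second pauses work, but whole-second steps are easy to fingerprint; random.uniform gives continuous jitter. A drop-in alternative, not what the original uses:

# sleep somewhere between 1 and 3 seconds, fractional values included
time.sleep(random.uniform(1, 3))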
- Save the data
def save(self, item):
    data = json.dumps(item, ensure_ascii=False)
    self.fp.write(data + "\n")  # one JSON object per line (JSON Lines)
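
Since save writes one JSON object per line (the JSON Lines format), the file can be read back record by record. A minimal sketch, assuming the default path from __init__:

with open("D:/shuju/XH3.json", encoding="utf-8") as f:
    jobs = [json.loads(line) for line in f if line.strip()]
print("loaded %d records" % len(jobs))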
- Close the file
def save_close(self):
    self.fp.close()
    print("Data saved to " + self.file_path)
- The entry point
if __name__ == '__main__':
    my = My()
    url = "https://h3c.zhiye.com/search?r=-1&p=1%5E-1%2C3%5E-1&c=&d=&k=#jlt"
    page = my.get_page(url)
    my.crawl(page)
    my.save_close()  # flush and close the output file when the crawl finishes
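
If the crawl dies halfway (a network error, a Ctrl-C), the file handle should still be closed. Wrapping the calls in try/finally guarantees save_close always runs; this variant is my suggestion, not the original flow:

if __name__ == '__main__':
    my = My()
    url = "https://h3c.zhiye.com/search?r=-1&p=1%5E-1%2C3%5E-1&c=&d=&k=#jlt"
    try:
        my.crawl(my.get_page(url))
    finally:
        my.save_close()  # runs even if the crawl raised an exception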
- Complete code
import requests
from bs4 import BeautifulSoup
from urllib.parse import parse_qs, urlparse, urljoin
import random
import time
import json
import re
import os


class My(object):
    def __init__(self):
        path = "D:/shuju"
        file = "XH3.json"
        os.makedirs(path, exist_ok=True)  # make sure the target directory exists
        self.file_path = os.path.join(path, file)
        self.fp = open(self.file_path, "a", encoding="utf-8")

    def get_soup(self, url):
        # pool of User-Agent strings to rotate through
        user_agents = [
            "Mozilla/5.0 (Windows NT 6.1; Win64; rv:27.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:10.0) Gecko/20100101 Firefox/10.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/21.0.1180.110 Safari/537.36",
            "Mozilla/5.0 (X11; Ubuntu; Linux i686 rv:10.0) Gecko/20100101 Firefox/27.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/34.0.1838.2 Safari/537.36",
            "Mozilla/5.0 (X11; Ubuntu; Linux i686 rv:27.0) Gecko/20100101 Firefox/27.0",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
        ]
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': random.choice(user_agents)  # rotate the UA on every request
        }
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.content.decode("utf-8"), "html5lib")
        return soup

    # Method 1: regex over the pager text
    def get_page(self, url):
        soup = self.get_soup(url)
        a = soup.select(".tablefooter span")[0].text
        page = re.findall(r"当前第\d+/(\d+)页", a)  # e.g. "当前第1/12页" -> "12"
        return int(page[0])

    # Method 2: read PageIndex out of the last pager link
    def get_page2(self, url):
        soup = self.get_soup(url)
        a = soup.select(".tablefooter span a")[3]
        page = parse_qs(urlparse(a['href']).query)['PageIndex'][0]
        return int(page)

    def get_urls(self, item, url):
        url = urljoin("https://h3c.zhiye.com/", url)  # absolute detail-page URL
        print(url)
        soup = self.get_soup(url)
        nwe = soup.select(".nvalue")
        item["招聘类别"] = nwe[0].text.replace("\n", "").replace(" ", "")
        item["工作性质"] = nwe[1].text.replace("\n", "").replace(" ", "")
        item["薪资范围"] = nwe[2].text.replace("\n", "").replace(" ", "")
        # NOTE: the original reused the "招聘类别" key here, overwriting nwe[0];
        # rename it to whatever label the page actually shows for this field
        item["招聘类别"] = nwe[3].text.replace("\n", "").replace(" ", "")
        item["发布时间"] = nwe[4].text.replace("\n", "").replace(" ", "")
        item["截止时间"] = nwe[5].text.replace("\n", "").replace(" ", "")
        new = soup.select(".xiangqingtext p")
        item["工作地点"] = new[1].text.replace("\n", "").replace(" ", "")
        item["工作职责"] = new[3].text.replace("\n", "").replace(" ", "").replace("\t", "")
        item["任职资格"] = new[5].text.replace("\n", "").replace(" ", "").replace("\t", "")

    def crawl(self, page):
        for i in range(1, page + 1):
            url = "https://h3c.zhiye.com/search/?p=1%5E-1%2C3%5E-1&PageIndex={}".format(i)
            soups = self.get_soup(url)  # reuse the header-rotating fetcher
            tr = soups.select(".listtable tbody tr")
            delays = [1, 2, 3]
            for t in tr:
                item = {}
                tds = t.find_all("td")  # look the row's cells up once
                name = tds[0].text.replace("\n", "").replace(" ", "")
                lie = tds[1].text.replace("\n", "").replace(" ", "")
                loc = tds[2].text.replace("\n", "").replace(" ", "")
                newtime = tds[3].text.replace("\n", "").replace(" ", "")
                urls = tds[0].a['href']
                self.get_urls(item, urls)
                time.sleep(random.choice(delays))  # pause 1-3 s between detail pages
                item["name"] = name
                item["lie"] = lie
                item["loc"] = loc
                item["time"] = newtime
                print('Saving record %s' % item)
                self.save(item)

    def save(self, item):
        data = json.dumps(item, ensure_ascii=False)
        self.fp.write(data + "\n")  # one JSON object per line (JSON Lines)

    def save_close(self):
        self.fp.close()
        print("Data saved to " + self.file_path)


if __name__ == '__main__':
    my = My()
    url = "https://h3c.zhiye.com/search?r=-1&p=1%5E-1%2C3%5E-1&c=&d=&k=#jlt"
    page = my.get_page(url)
    my.crawl(page)
    my.save_close()  # flush and close the output file