Sharing a Simple Web Scraper
A simple scraper for the XX recruitment site, for learning and exchange only. Illegal use is strictly forbidden, and I take no responsibility for it.
The code mainly uses requests + BeautifulSoup, with a small amount of regular expressions.
A proxy pool is also wired in; thanks to J_hao104 for providing the method for fetching proxies.
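For reference, here is a minimal sketch of how one proxy from the pool ends up in requests. It assumes the proxy_pool service at the address used in the script below returns a bare "ip:port" string from its /get/ endpoint, which is how this script consumes it; newer proxy_pool releases return JSON instead, in which case the address has to be picked out of the parsed response first.

import requests

# Ask the proxy pool for one address (assumed to be a plain-text "ip:port" response)
proxy = requests.get("http://123.207.35.36:5010/get/").content.decode('utf-8')
# Route an HTTP request through that proxy
resp = requests.get("https://www.zhaopin.com/",
                    proxies={"http": "http://{}".format(proxy)},
                    timeout=10)
print(resp.status_code)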
The scraped data is finally written to an Excel file. The Python version is 3.6, and as usual I recommend Anaconda for managing Python versions and their packages.
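On the Excel side the script simply concatenates the per-page DataFrames and calls to_excel. One caveat: writing a .xls file relies on the xlwt package in older pandas, and recent pandas versions have dropped .xls writing entirely, so if the final line errors out, switching the extension to .xlsx (written via openpyxl) is the safer choice. A tiny, made-up example:

import pandas as pd

df = pd.DataFrame({"岗位名称": ["数据分析师"], "薪水": ["15K-25K"]})  # hypothetical one-row example
df.to_excel("demo.xlsx", index=False)  # .xlsx avoids the deprecated xlwt dependency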
To keep the scraping running smoothly, the crawler has to be disguised as a browser, and it must not fetch too frequently: sleep for a while between requests.
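Concretely, "disguising as a browser" here just means sending browser-like request headers, built with a random Chrome User-Agent from fake_useragent, and the throttling is a random sleep between pages. A minimal sketch of those two ideas, using the same libraries as the full script (the URL and page count are placeholders):

import random
import time

import requests
from fake_useragent import UserAgent

ua = UserAgent()
headers = {
    'User-Agent': ua.chrome,                # a random, real-looking Chrome UA string
    'Referer': 'https://www.zhaopin.com/',
}
for page in range(3):                       # placeholder 3-page loop
    resp = requests.get("https://www.zhaopin.com/", headers=headers)
    time.sleep(random.randint(1, 5))        # pause 1-5 seconds so requests are not back-to-back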
The full code is posted below, for learning and exchange only. Where the code falls short, please go ahead and optimize it yourselves!
#-*- encoding: utf-8 -*-
import requests
import pandas as pd
import random
import time
from bs4 import BeautifulSoup as BS
import re
from fake_useragent import UserAgent
ua = UserAgent()
headers = {
    'User-Agent': ua.chrome,
    'Host': 'sou.zhaopin.com',
    'Referer': 'https://www.zhaopin.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9'
}
# Fetch one proxy IP from the proxy pool (J_hao104's proxy_pool service)
def get_proxy():
    return requests.get("http://123.207.35.36:5010/get/").content

# The pool returns bytes; decode once and reuse the same proxy for the whole run
proxy = get_proxy().decode('utf-8')
#print(proxy)
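# Query one page of results from Zhaopin's search API and return the parsed JSON (up to 90 postings per page).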
def getHtml(start, city, kw):
    # Assemble the search-API URL: 90 postings per request, sorted by publish date
    url = "https://fe-api.zhaopin.com/c/i/sou?start=" + str(start) + \
          "&pageSize=90&cityId=" + city + \
          "&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&sortType=publish&kw=" + kw + \
          "&kt=3&lastUrlQuery=%7B%22jl%22:%22538%22,%22kw%22:%22%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88%22,%22kt%22:%223%22%7D&_v=0.79472005&" \
          "x-zp-page-request-id=15621c9c7abd41679024b2ac5cf0f992-1541566903009-311773"
    #print(url)
    #response = requests.get(url, headers)
    # headers must be passed as a keyword argument; passed positionally it would be sent as query params
    response = requests.get(url, headers=headers, proxies={"http": "http://{}".format(proxy)})
    html = response.json()
    return html
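# Walk the postings in one page of API results, fetch each posting's detail page for its job description,
# and return everything as a single pandas DataFrame.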
def getOnePageJobs(html):
    global job_datas
    job_datas = html["data"]["results"]
    jobs = pd.DataFrame(
        # columns=["company", "type", "size", "jobName", "salary", "workingExp", "eduLevel", "welfare", "createDate",
        #          "endDate", "city", "geo_lon", "geo_lat"])
        columns=["公司名称", "公司类型", "公司规模", "岗位名称", "薪水", "工作经验", "学历", "岗位亮点", "创建时间",
                 "截止时间", "城市", "位置坐标经度", "位置坐标纬度", "岗位职责"])
    for i in range(len(job_datas)):
        company = job_datas[i]["company"]["name"]
        company_type = job_datas[i]["company"]["type"]["name"]
        company_size = job_datas[i]["company"]["size"]["name"]
        jobName = job_datas[i]["jobName"]
        salary = job_datas[i]["salary"]
        workingExp = job_datas[i]["workingExp"]['name']
        eduLevel = job_datas[i]["eduLevel"]["name"]
        welfare = job_datas[i]["welfare"]
        createDate = job_datas[i]["createDate"]
        endDate = job_datas[i]["endDate"]
        positionURL = job_datas[i]["positionURL"]
        city = job_datas[i]["city"]["display"]
        geo_lon = job_datas[i]['geo']["lon"]
        geo_lat = job_datas[i]['geo']["lat"]
        # Second layer: fetch the posting's detail page and extract the job description
        response = requests.get(positionURL, proxies={"http": "http://{}".format(proxy)})
        soup = BS(response.text, 'html.parser')
        content = soup.find_all('div', class_='pos-ul')
        # Keep only the Chinese characters of the description, then strip leftover font names
        # (新魏/华文楷体/宋体/微软雅黑/磅 from inline styles) and punctuation
        pattern = re.compile(u"[\u4e00-\u9fa5]+")
        content = re.findall(pattern, str(content))
        content = re.sub(r"[新魏华文楷体宋体微软雅黑磅\[\]\"\'\,\.\,\。]", "", str(content))
        job = pd.DataFrame(
            [company, company_type, company_size, jobName, salary, workingExp, eduLevel, welfare, createDate, endDate,
             city, geo_lon, geo_lat, content]).T
        # job.columns = ["company", "type", "size", "jobName", "salary", "workingExp", "eduLevel", "welfare", "createDate",
        #                "endDate", "city", "geo_lon", "geo_lat", "content"]
        job.columns = ["公司名称", "公司类型", "公司规模", "岗位名称", "薪水", "工作经验", "学历", "岗位亮点", "创建时间",
                       "截止时间", "城市", "位置坐标经度", "位置坐标纬度", "岗位职责"]
        jobs = pd.concat([jobs, job], ignore_index=True)
    return jobs
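# Interactive entry point: ask for a city, a keyword and a page count, scrape page by page, then dump to Excel.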
if __name__ == "__main__":
    jobs = []
    i = 1
    city = input("请输入您想待的城市:")
    kw = input("请输入您想干的岗位名称:")
    page = input("请输入您想查询前多少页的内容:")
    while i < (int(page) + 1):
        print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + " 正在爬取第" + str(i) + "页...")
        start = 90 * (i - 1)
        startTime = time.time()
        html = getHtml(start, city, kw)
        onepagejobs = getOnePageJobs(html)
        jobs.append(onepagejobs)
        # Sleep a random 1-5 seconds (integer) between pages to avoid triggering anti-scraping checks
        s = random.randint(1, 5)
        time.sleep(s)
        endTime = time.time()
        print("爬取第" + str(i) + "页,耗时:" + str(endTime - startTime) + "s")
        i += 1
    print("全部爬取完毕!")
    alljobs = pd.concat(jobs, ignore_index=True)
    alljobs.to_excel(str(city) + "_" + str(kw) + "_" + "智联招聘.xls")