Python crawler: scraping job details from Liepin (liepin.com)
This is my first time learning Python and my first attempt at web scraping; this is the first complete example I finished, written up here for the record.
baseurl.py
# @author centao
# @time 2020.10.24
class findbaseurl:
    def detbaseurl(self):
        dqs = ""
        indexcity = 1
        indexoccupation = 1
        while indexoccupation == 1:
            print("请输入想要查找的职位:数据挖掘,心理学,java,web前端工程师")
            occupation = input()
            if occupation != "数据挖掘" and occupation != "心理学" and occupation != "java" and occupation != "web前端工程师":
                print("职位选择错误!")
                indexoccupation = 1
            else:
                self.occu = occupation
                indexoccupation = 0
        while indexcity == 1:
            print("请输入城市:北京,上海,广州,深圳,杭州")
            city = input()
            if city == "北京":
                dqs = "010"
                indexcity = 0
                self.ci = city
            elif city == "上海":
                dqs = "020"  # fixed typo: the original assigned to "das"
                indexcity = 0
                self.ci = city
            elif city == "广州":
                dqs = "050020"
                indexcity = 0
                self.ci = city
            elif city == "深圳":
                dqs = "050090"
                indexcity = 0
                self.ci = city
            elif city == "杭州":
                dqs = "070020"
                indexcity = 0
                self.ci = city
            else:
                print("输入城市有误")
                indexcity = 1
        url = "https://www.liepin.com/zhaopin/?key=" + occupation + "&dqs=" + dqs + "&curPage="
        return url
This is a simple class: you type in a city and a job title, and it builds the corresponding search URL. Strictly speaking it isn't necessary; I wrote it purely to practice defining a Python class.
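For reference, a minimal usage sketch (assuming "java" and "北京" are typed at the two prompts; the printed URL below is just what that input would produce):

# Minimal usage sketch of findbaseurl (example inputs: "java" / "北京")
from baseurl import findbaseurl

u = findbaseurl()
baseurl = u.detbaseurl()   # prompts for job title and city on stdin
print(u.occu, u.ci)        # e.g. "java 北京"
print(baseurl + "1")       # e.g. https://www.liepin.com/zhaopin/?key=java&dqs=010&curPage=1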
demo.py
# @author centao
# @time 2020.10.24
from bs4 import BeautifulSoup  # HTML parsing
import re                      # regular expressions, for text matching
import requests                # build the URLs and fetch the pages
import random
import time
from baseurl import findbaseurl

def main():
    u = findbaseurl()
    baseurl = u.detbaseurl()
    savepathlink = u.occu + u.ci + "link.txt"
    savepathdetail = u.occu + u.ci + "详情.txt"
    # 1. crawl the listing pages and collect the detail links
    urllist = getworkurl(baseurl)
    # 2. save the detail links
    savedata(savepathlink, urllist)
    a = readlink(savepathlink)  # read the saved links back (not used further)
    # 3. crawl each detail page and save the job description
    askURL(urllist, savepathdetail)
user_Agent = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36 Edg/80.0.361.54',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41',
'Opera/9.80 (Macintosh; Intel Mac OS X; U; en) Presto/2.2.15 Version/10.00',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0)',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)']
# Note: requests expects lower-case scheme keys in a proxies dict ('http'/'https'),
# so the original upper-case 'HTTP' keys were silently ignored; https:// requests
# would additionally need an 'https' entry.
proxy_list = [{'http': 'http://61.135.185.156:80'},
              {'http': 'http://61.135.185.111:80'},
              {'http': 'http://61.135.185.112:80'},
              {'http': 'http://61.135.185.160:80'},
              {'http': 'http://183.232.231.239:80'},
              {'http': 'http://202.108.22.5:80'},
              {'http': 'http://180.97.33.94:80'},
              {'http': 'http://182.61.62.74:80'},
              {'http': 'http://182.61.62.23:80'},
              {'http': 'http://183.232.231.133:80'},
              {'http': 'http://183.232.231.239:80'},
              {'http': 'http://220.181.111.37:80'},
              {'http': 'http://183.232.231.76:80'},
              {'http': 'http://202.108.23.174:80'},
              {'http': 'http://183.232.232.69:80'},
              {'http': 'http://180.97.33.249:80'},
              {'http': 'http://180.97.33.93:80'},
              {'http': 'http://180.97.34.35:80'},
              {'http': 'http://180.97.33.249:80'},
              {'http': 'http://180.97.33.92:80'},
              {'http': 'http://180.97.33.78:80'}]
# fetch the HTML of one listing page
def askmainURL(url):
    # browser-like request header
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"}
    html = requests.get(url=url, headers=head)  # fixed: the header was built but never passed
    return html
# collect the detail-page links of the job postings
def getworkurl(baseurl):
    urllist = []
    for a in range(1, 5):
        url = baseurl + str(a)
        html = askmainURL(url)
        soup = BeautifulSoup(html.text, "html.parser")
        soup = soup.find_all("h3")
        for i in soup:
            if i.has_attr("title"):
                href = i.find_all("a")[0]["href"]
                if re.search("https", href):
                    urllist.append(href)
                else:
                    href = "https://www.liepin.com" + href
                    urllist.append(href)
            else:
                continue
    return urllist
# ------------------------- crawl the detail pages -------------------------------------------
def readlink(savepathlink):
    file = open(savepathlink, mode='r')
    contents = file.readlines()
    file.close()
    return contents
def askURLsingle(url):
    heads = {"User-Agent": random.choice(user_Agent)}
    # html = requests.get(url=a, headers=heads, timeout=(3, 7))
    try:
        html = requests.get(url=url, headers=heads, proxies=proxy_list[random.randint(0, 20)], timeout=(3, 7))
        html.encoding = 'utf-8'
        return html
    except requests.exceptions.RequestException as e:
        print(e)
        return None  # signal failure to the caller
def askURL(url, savepath):
    for i in range(len(url)):
        print(url[i])
        a = url[i]
        heads = {"User-Agent": random.choice(user_Agent)}
        try:
            # html = requests.get(url=a, headers=heads, proxies=proxy_list[random.randint(0, 20)], timeout=(3, 7))
            html = requests.get(url=a, headers=heads, timeout=(3, 7))
            html.encoding = 'utf-8'  # fixed typo: was 'uft-8'
            soup = BeautifulSoup(html.text, "html.parser")
            item = soup.find_all("div", class_="content content-word")
            time.sleep(1)
        except requests.exceptions.RequestException as e:
            print(e)
            continue
        if len(item) != 0:
            item = item[0]
            item = item.text.strip()
            savedetail(savepath, item)
        else:
            # the page came back empty: retry a few times through the proxy helper
            a = 0
            while len(item) == 0 and a < 10:
                h = askURLsingle(url[i])
                if h is None:  # request failed, count the attempt and try again
                    a = a + 1
                    continue
                soup = BeautifulSoup(h.text, "html.parser")
                item = soup.find_all("div", class_="content content-word")
                a = a + 1
            if len(item) != 0:
                item = item[0]
                item = item.text.strip()
                savedetail(savepath, item)
# save data
def savedetail(savepath, data):
    file = open(savepath, mode='a', errors='ignore')
    file.write(data + '\n')
    file.close()

def savedata(savepath, urllist):
    file = open(savepath, mode='a')
    for item in urllist:
        file.write(item + '\n')
    file.close()

if __name__ == '__main__':
    main()
demo.py contains the code that crawls the job-detail links and then the job-detail pages themselves.
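As a side note, the manual retry loop in askURL re-issues the request by hand; requests can also delegate retries to urllib3 at the session level. Below is a minimal sketch of that alternative, not part of the scripts above; make_session and its parameters are my own choices:

# Minimal sketch: session-level retries with requests + urllib3 (an alternative
# to the manual retry loop in askURL; not the author's original approach).
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(total=3, backoff=1.0):
    retry = Retry(total=total, backoff_factor=backoff,
                  status_forcelist=[429, 500, 502, 503, 504])
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=retry))
    session.mount("http://", HTTPAdapter(max_retries=retry))
    return session

# usage: make_session().get(detail_url, headers={"User-Agent": "..."}, timeout=(3, 7))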
createdict.py
#coding:utf-8
# @author centao
# @time 2020.10.24
import re
import jieba.analyse as analyse
import jieba
# with open('java上海详情.txt', 'r', encoding='GBK') as f:
#     text = f.read()
# data cleaning:
#   strip the list numbering
def main():
    print("输入职位")
    a = input()
    create(a)

def create(a):
    text = read(a + '上海详情.txt') + read(a + '北京详情.txt') + read(a + '深圳详情.txt') + read(a + '广州详情.txt') + read(a + '杭州详情.txt')
    text = str(text)
    text = re.sub(r'([0-9 a-z]+[\.\、,,))])|( [0-9]+ )|[;;]', '', text)  # strip list numbering and other stray digits
    text = re.sub(r'[,、。【】()/]', ' ', text)  # strip punctuation
    stopword = ['熟悉', '需要', '岗位职责', '职责描述', '工作职责', '任职', '优先', '项目', '团队', '产品', '相关', '任职', '业务', '要求', '文档', '工作', '能力',
                '优化', '需求', '并发', '经验', '完成', '具备', '职责', '具有', '应用', '平台', '参与', '编写', '了解', '调优', '使用', '服务', '代码', '性能', '缓存',
                '中间件', '解决', '海量', '场景', '技术', '用户', '进行', '负责', '领域', '系统', '构建', '招聘', '专业', '课程', '公司', '员工', '人才', '学习', '组织', '岗位',
                '薪酬', '运营', '制定', '体系', '发展', '完善', '提供', '学员', '学生', '流程', '定期', '行业', '描述', '策划', '内容', '协助', '方案']
    keyword = jieba.lcut(text, cut_all=False)
    out = ' '  # cleaned result
    keywords = " ".join(keyword)
    for word in keyword:
        if word not in stopword:
            if word != '\t':
                out += word
                out += " "
    # TF-IDF analysis
    tf_idf = analyse.extract_tags(out,
                                  topK=100,
                                  withWeight=False,
                                  allowPOS=("n", "vn", "v"))
    print(tf_idf)
    savedata(a + '字典.txt', tf_idf)
def read(savepathlink):
    file = open(savepathlink, mode='r')
    contents = file.readlines()
    file.close()
    return contents

def savedata(savepath, urllist):
    file = open(savepath, mode='a')
    for item in urllist:
        file.write(item + '\n')
    file.close()

if __name__ == '__main__':
    main()
This is the code that builds the keyword dictionary, using jieba segmentation and TF-IDF analysis, in preparation for data cleaning.
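To make the TF-IDF step concrete, here is a minimal, self-contained sketch of jieba.analyse.extract_tags on a short made-up sentence (the sample text and keyword count are illustrative only):

# Minimal sketch of jieba's TF-IDF keyword extraction (sample text is made up).
import jieba.analyse as analyse

sample = "负责数据挖掘算法的设计与实现,熟悉Python和机器学习常用框架"
tags = analyse.extract_tags(sample, topK=5, withWeight=True, allowPOS=("n", "vn", "v"))
for word, weight in tags:
    print(word, round(weight, 3))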
fenci_ciyun.py
#coding:utf-8
# @author centao
# @time 2020.10.24
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
import numpy as np
from PIL import Image
import jieba

# data loading
def main():
    print("输入职业:")
    occu = input()
    picture(occu)

def read(savepathlink):
    file = open(savepathlink, mode='r', errors='ignore')
    contents = file.readlines()
    file.close()
    return contents
def picture(occu):
    text = read(occu + '上海详情.txt') + read(occu + '北京详情.txt') + read(occu + '深圳详情.txt') + read(occu + '广州详情.txt') + read(occu + '杭州详情.txt')
    text = str(text)
    # data cleaning:
    # strip the list numbering
    text = re.sub(r'([0-9 a-z]+[\.\、,,))])|( [0-9]+ )|[;;]', '', text)
    # strip punctuation
    text = re.sub(r'[,、。【】()/]', ' ', text)
    keyword = jieba.lcut(text, cut_all=False)
    tx = read(occu + "字典.txt")
    out = ''
    for word in keyword:
        if word + '\n' in tx:
            out += word
            out += " "
        else:
            continue
    # keywords = " ".join(keyword)
    mask = np.array(Image.open('2.jpg'))
    font = r'C:\Windows\Fonts\simkai.ttf'
    wc = WordCloud(
        font_path=font,            # font to use (required for Chinese characters)
        margin=2,
        mask=mask,                 # mask/background image
        background_color='white',  # background colour
        max_font_size=300,
        # min_font_size=1,
        width=5000,
        height=5000,
        max_words=200,
        scale=3
    )
    wc.generate(out)              # build the word cloud
    wc.to_file(occu + '词云.jpg')  # save it to a local file
    # display the image
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()

if __name__ == '__main__':
    main()
This is the final piece: it builds the word-cloud image from the per-city detail txt files and the dictionary created above.
Shown below are the word clouds for the web前端工程师 (front-end engineer) and java positions.