51job词云
51job词云步骤:需要将职位信息爬取后放到txt文件中,然后用jieba和wordcloud进行词云分析。
1. 职位信息爬取
def position_url(url):
    """Scrape one 51job search-result page and save up to 50 postings.

    Downloads ``url``, extracts the search city, the search keyword and the
    per-posting fields with regular expressions, fetches the detail text of
    at most 50 postings through ``xinxi``, prints the collected rows and
    appends them to the keyword's text file through ``positions_excel``.

    Args:
        url: Address of a 51job search-result page.

    Returns:
        None.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3754.400 QQBrowser/10.5.4020.400",
        "Connection": "close"
    }
    with requests.session() as s:
        # A dict holds one value per key: the original literal listed two
        # "http" proxies and the first was silently discarded, so only the
        # entry that was actually in effect is kept.
        s.proxies = {"http": "117.88.4.132:3000"}
        # Fetch the page once, through the configured session. Undecodable
        # bytes are dropped (as xinxi does) instead of aborting the page.
        response = s.get(url, headers=headers, timeout=5).content.decode("gbk", "ignore")

    positions_city = re.findall(r'<input autocomplete="off".*?value="(.*?)".*?>.*?', response, re.DOTALL)
    new_position_city = ''.join(positions_city)
    position_type = re.findall(r'<p class="ipt">.*?<input.*?value="(.*?)".*?>.*?</p>', response, re.DOTALL)
    new_position_type = ''.join(position_type)

    positions_name = re.findall(r'<span>.*?<a target=".*?" title="(.*?)".*?>.*?</a></span>', response, re.DOTALL)
    positions_company = re.findall(r'<span class="t2"><a target=".*?".*?>(.*?)</a></span>', response, re.DOTALL)
    positions_location = re.findall(r'<div class="el">.*?<span class="t3">(.*?)</span>.*?</div>', response, re.DOTALL)
    positions_money = re.findall(r'<div class="el">.*?<span class="t4">(.*?)</span>.*?</div>', response, re.DOTALL)
    positions_links = re.findall(r'<span>.*?<a target=".*?" title=".*?" href="(.*?)".*?>.*?</a></span>', response, re.DOTALL)

    # Only the first 50 postings get a detail-page request.
    positions_informations = [xinxi(link, headers) for link in positions_links[:50]]

    # zip() stops at the shortest column, so a page with fewer than 50
    # postings (or columns of unequal length) no longer raises IndexError.
    positons = [
        {
            "职位名称": name,
            "公司名称": company,
            "公司地址": location,
            "职位薪资": money,
            "职位信息链接": link,
            "职位信息": information,
        }
        for name, company, location, money, link, information in zip(
            positions_name,
            positions_company,
            positions_location,
            positions_money,
            positions_links,
            positions_informations,
        )
    ]
    print(positons)
    # Nothing scraped means nothing worth appending to the output file.
    if positons:
        positions_excel(positons, new_position_city, new_position_type)
2. 爬取职位更精确的信息
def xinxi(position_link, headers):
    """Fetch a posting's detail page and return its description as plain text.

    Args:
        position_link: URL of the posting's detail page.
        headers: HTTP request headers to send with the request.

    Returns:
        The content of the last ``<div class="bmsg job_msg inbox">`` block
        with HTML tags and spaces removed, or an empty string when the page
        contains no such block.
    """
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is ``params``, which would put them in the query string.
    responses = requests.get(position_link, headers=headers, timeout=5).content.decode("gbk", "ignore")
    position_information = re.findall(r'<div class="bmsg job_msg inbox">(.*?)</div>', responses, re.DOTALL)
    if not position_information:
        # Previously this path hit an unbound local variable.
        return ''
    # Same result as the original loop, which kept only the last match.
    without_tags = re.sub(r'<.*?>', '', position_information[-1])
    return without_tags.replace(' ', '')
3. 将数据以csv格式存放到txt文件中
def positions_excel(position, position_city, position_type):
    """Append scraped postings, in CSV format, to the keyword's text file.

    The file is named ``2020年<position_type>职位信息表.txt`` and is created in
    the current working directory on first use. The header row is written
    only when the file is new or empty, so repeated calls do not scatter
    header rows through the data.

    Args:
        position: Iterable of dicts keyed by the six column names below.
        position_city: Search city; accepted for interface compatibility but
            not used.
        position_type: Search keyword; determines the output file name.

    Returns:
        None.
    """
    import os

    H = "2020年" + position_type + "职位信息表.txt"
    headers = ["职位名称", "公司名称", "公司地址", "职位薪资", "职位信息链接", "职位信息"]
    # Decide before opening: the file is opened in append mode below.
    needs_header = not os.path.exists(H) or os.path.getsize(H) == 0
    with open(H, "a+", encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(f, headers)
        if needs_header:
            writer.writeheader()
        writer.writerows(position)
4. 多线程使速度更快
def BeiJing():
    """Crawl the Beijing listings for every keyword.

    Walks result pages 1 through 99 of the ``010000`` search URL (presumably
    the site's code for Beijing) and, on each page number, hands one URL per
    keyword to ``position_url``.
    """
    keywords = ["Python", "JAVA", "golang", "C%252B%252B", "web", "前端", "node.js", "C%2523"]
    template = "https://search.51job.com/list/010000,000000,0000,00,9,99,%s,2,%s.html"
    for page in range(1, 100):
        for keyword in keywords:
            position_url(template % (keyword, page))
def ShangHai():
    """Crawl the Shanghai listings for every keyword.

    Walks result pages 1 through 99 of the ``020000`` search URL (presumably
    the site's code for Shanghai) and, on each page number, hands one URL per
    keyword to ``position_url``.
    """
    keywords = ["Python", "JAVA", "golang", "C%252B%252B", "web", "前端", "node.js", "C%2523"]
    template = "https://search.51job.com/list/020000,000000,0000,00,9,99,%s,2,%s.html"
    for page in range(1, 100):
        for keyword in keywords:
            position_url(template % (keyword, page))
def GuangZhou():
    """Crawl the Guangzhou listings for every keyword.

    Walks result pages 1 through 99 of the ``030200`` search URL (presumably
    the site's code for Guangzhou) and, on each page number, hands one URL
    per keyword to ``position_url``.
    """
    keywords = ["Python", "JAVA", "golang", "C%252B%252B", "web", "前端", "node.js", "C%2523"]
    template = "https://search.51job.com/list/030200,000000,0000,00,9,99,%s,2,%s.html"
    for page in range(1, 100):
        for keyword in keywords:
            position_url(template % (keyword, page))
def ShenZhen():
    """Crawl the Shenzhen listings for every keyword.

    Walks result pages 1 through 99 of the ``040000`` search URL (presumably
    the site's code for Shenzhen) and, on each page number, hands one URL per
    keyword to ``position_url``.
    """
    keywords = ["Python", "JAVA", "golang", "C%252B%252B", "web", "前端", "node.js", "C%2523"]
    template = "https://search.51job.com/list/040000,000000,0000,00,9,99,%s,2,%s.html"
    for page in range(1, 100):
        for keyword in keywords:
            position_url(template % (keyword, page))
def WuHan():
    """Crawl the Wuhan listings for every keyword.

    Walks result pages 1 through 99 of the ``180200`` search URL (presumably
    the site's code for Wuhan) and, on each page number, hands one URL per
    keyword to ``position_url``.
    """
    keywords = ["Python", "JAVA", "golang", "C%252B%252B", "web", "前端", "node.js", "C%2523"]
    template = "https://search.51job.com/list/180200,000000,0000,00,9,99,%s,2,%s.html"
    for page in range(1, 100):
        for keyword in keywords:
            position_url(template % (keyword, page))
def NanJing():
    """Crawl the Nanjing listings for every keyword.

    Walks result pages 1 through 99 of the ``070200`` search URL (presumably
    the site's code for Nanjing) and, on each page number, hands one URL per
    keyword to ``position_url``.
    """
    keywords = ["Python", "JAVA", "golang", "C%252B%252B", "web", "前端", "node.js", "C%2523"]
    template = "https://search.51job.com/list/070200,000000,0000,00,9,99,%s,2,%s.html"
    for page in range(1, 100):
        for keyword in keywords:
            position_url(template % (keyword, page))
def main():
    """Scrape all six cities concurrently, one thread per city.

    Starts a thread for each city crawler and then waits for every thread to
    finish, so that the output files are complete when this function
    returns and later steps can safely read them.
    """
    crawlers = (BeiJing, ShangHai, GuangZhou, ShenZhen, WuHan, NanJing)
    threads = [threading.Thread(target=crawler) for crawler in crawlers]
    for thread in threads:
        thread.start()
    # Without joining, main() would return while files are still being written.
    for thread in threads:
        thread.join()
5. 词云调用
# Build one word cloud per search keyword from its saved postings file.
types = ["Python", "JAVA", "golang", "C#", "web", "前端", "node.js", "C++"]
# The loop variable is deliberately not named `type`: at module level that
# would rebind the builtin `type` for the rest of the file.
for job_type in types:
    ciyun(job_type)
6. 词云
def ciyun(position_type):
    """Build and display a word cloud from a keyword's saved postings file.

    Reads ``2020年<position_type>职位信息表.txt`` from the current working
    directory, segments the text with jieba, renders a word cloud of at most
    200 words and shows it in a matplotlib window.

    Args:
        position_type: Search keyword whose postings file should be read.

    Returns:
        None.
    """
    # utf-8-sig matches the encoding used when the file is written and keeps
    # the byte-order mark out of the text; `with` closes the file handle.
    with open('2020年' + position_type + '职位信息表.txt', 'r', encoding='utf-8-sig') as f:
        text_from_file = f.read()
    word_jieba = jieba.cut(text_from_file)
    # WordCloud splits on whitespace, so the segmented words are space-joined.
    word_space = ' '.join(word_jieba)
    stopwords = {'关键字', '类别', '职能', 'jobs', 'job', '要求', '任职'}
    # NOTE(review): arial.ttf presumably lacks CJK glyphs, so Chinese words
    # may render as empty boxes - confirm and switch to a CJK-capable font.
    my_wordcloud = WordCloud(
        background_color='white',  # was misspelled 'while', not a colour name
        max_words=200,
        stopwords=stopwords,
        font_path='arial.ttf',
        max_font_size=100,
        random_state=50
    ).generate(word_space)
    # Title the window with the keyword being shown rather than a fixed 'Python'.
    plt.figure(position_type)
    plt.imshow(my_wordcloud)
    plt.axis('off')
    plt.show()