#-*- coding: utf-8 -*-
import urllib.request
import os,time
from bs4 import BeautifulSoup
# Crawl the job categories listed on Zhaopin, then crawl the postings of each
# category and store them in a text file named after the category.

# Request headers to rotate through, one dict per User-Agent.
hds = [
    {'User-Agent': agent}
    for agent in (
        'Mozilla/5.0 (Windows; U;Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)',
    )
]

# Job category names, in the order they were scraped.
position = []

# Maps each job category name to its link address.
href = {}

# Zhaopin search landing page.
url = "http://sou.zhaopin.com/"
def search():
    """Download the Zhaopin landing page and return it as text.

    Requests the module-level ``url`` using the second header set in ``hds``.

    Returns:
        The response body decoded as UTF-8 (undecodable bytes are dropped),
        or ``None`` when the request fails with an HTTP/URL error; the error
        is printed in that case.
    """
    request = urllib.request.Request(url, headers=hds[1])
    try:
        # The context manager closes the connection as soon as the body has
        # been read (or reading fails) instead of leaving it to the garbage
        # collector.
        with urllib.request.urlopen(request) as response:
            raw = response.read()
    except (urllib.request.HTTPError, urllib.request.URLError) as e:
        print(e)
        return None
    return raw.decode('utf8', 'ignore')
def rmline(str):
    """Join the non-blank items of ``str`` after trimming their whitespace.

    Every element produced by iterating ``str`` is stripped; elements that
    are empty after stripping are dropped and the remaining ones are
    concatenated without a separator. Iterating a plain string yields single
    characters, so in that case every whitespace character is removed.
    """
    trimmed = (piece.strip() for piece in str)
    return ''.join(chunk for chunk in trimmed if chunk)
def parse_html(html):
    """Collect job-category names and links from the landing page.

    Finds the first ``<div class="clearfixed">`` in ``html`` and, for each of
    its direct child tags that has an ``href`` attribute, appends the tag's
    text to the module-level ``position`` list and stores ``text -> href`` in
    the module-level ``href`` dict.

    Args:
        html: Page markup as text, or ``None`` when the download failed (in
            which case nothing is collected).
    """
    if html is None:
        # The download step returns None on failure; nothing to parse.
        return
    soup = BeautifulSoup(html, 'lxml')
    container = soup.find("div", class_="clearfixed")
    if container is None:
        print('parse_html: no <div class="clearfixed"> found in the page')
        return
    # find_all(True, recursive=False) yields only the direct child *tags*.
    # Iterating `.children` would also yield the whitespace text nodes
    # between them, which have no `.get()` and would raise AttributeError.
    for tag in container.find_all(True, recursive=False):
        link = tag.get('href')
        if link is None:
            # Not a link; there is no address to crawl for this entry.
            continue
        name = tag.get_text()
        position.append(name)
        href[name] = link
def search_href(href, hds):
    """Download the page at ``href`` and return it as text.

    Args:
        href: Absolute URL of the page to fetch.
        hds: Mapping of HTTP request headers to send (e.g. a User-Agent).

    Returns:
        The response body decoded as UTF-8 (undecodable bytes are dropped),
        or ``None`` when the request fails with an HTTP/URL error; the error
        is printed in that case.
    """
    request = urllib.request.Request(href, headers=hds)
    try:
        # The context manager closes the connection as soon as the body has
        # been read (or reading fails) instead of leaving it to the garbage
        # collector.
        with urllib.request.urlopen(request) as response:
            raw = response.read()
    except (urllib.request.HTTPError, urllib.request.URLError) as e:
        print(e)
        return None
    return raw.decode('utf8', 'ignore')
def parse_href_html(html, position_name):
    """Extract job postings from a result page and save them to a text file.

    Every ``<table class="newlist">`` after the first one is treated as one
    posting. For each posting the text of four nodes is written on one line,
    separated by single spaces: the link inside the ``zwmc`` cell's div, the
    link inside the ``gsmc`` cell, the ``zwyx`` cell and the ``gzdd`` cell.
    Postings missing any of these nodes are skipped.

    The output file is ``<position_name>.txt`` in the current directory
    (UTF-8, overwritten), with ``/`` in the name replaced by a space so it is
    a valid file name.

    Args:
        html: Page markup as text, or ``None`` when the download failed (in
            which case no file is written).
        position_name: Job category name used to name the output file.
    """
    if html is None:
        # The download step returns None on failure; leave any existing
        # output file untouched rather than truncating it.
        return

    soup = BeautifulSoup(html, 'lxml')
    lines = []
    # The first matching table is skipped, as in the original layout
    # handling (presumably a header row -- confirm against the live page).
    for row in soup.find_all("table", class_="newlist")[1:]:
        title_cell = row.find("td", class_="zwmc")
        title_div = title_cell.div if title_cell is not None else None
        company_cell = row.find("td", class_="gsmc")
        nodes = (
            title_div.a if title_div is not None else None,
            company_cell.a if company_cell is not None else None,
            row.find("td", class_="zwyx"),
            row.find("td", class_="gzdd"),
        )
        if any(node is None for node in nodes):
            continue
        # get_text() instead of .string: .string is None whenever a node has
        # nested markup, which would break the concatenation.
        lines.append(' '.join(node.get_text() for node in nodes) + '\n')

    file_name = position_name.replace("/", ' ') + ".txt"
    # Parse first, write afterwards, and let `with` close the file even if
    # writing fails.
    with open(file_name, 'w', encoding="utf-8") as out:
        out.writelines(lines)
if __name__ == "__main__":
    # Script entry point: scrape the category list, then fetch and store the
    # postings of every category.
    # Local import: only this entry point needs to resolve scraped links.
    from urllib.parse import urljoin

    parse_html(search())
    for index, (name, link) in enumerate(href.items()):
        if not link:
            # No usable address was scraped for this category.
            continue
        # Rotate through however many header sets are configured.
        headers = hds[index % len(hds)]
        # urljoin resolves relative, root-relative and absolute hrefs
        # against the landing page, unlike plain string concatenation.
        parse_href_html(search_href(urljoin(url, link), headers), name)
# 爬取智联招聘信息并存储
# 最新推荐文章于 2024-04-13 00:38:24 发布