Target site: www.yingjiesheng.com
Data scraped: the campus info sessions (宣讲会) and job postings pushed under the Beijing / Shanghai / Guangzhou city channels of www.yingjiesheng.com (YingJieSheng, a graduate-recruitment site).
Taking Beijing as an example, the scraped results are stored in the file file_beijing_xjh.txt.
While scraping, the pages came back gzip-compressed, which initially broke parsing; the workaround below is based on a solution found online: check the Content-Encoding response header and decompress the body when it reports gzip.
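In isolation, the fix boils down to a few lines. A minimal Python 2 sketch (the URL and User-Agent here are just placeholders):

import urllib2
import gzip
import StringIO

req = urllib2.Request('http://www.yingjiesheng.com', headers={'User-Agent': 'Mozilla/4.0'})
response = urllib2.urlopen(req)
html = response.read()
# urllib2 does not decompress the body automatically, so do it by hand
if response.info().get('Content-Encoding') == 'gzip':
    html = gzip.GzipFile(fileobj=StringIO.StringIO(html)).read()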
The full code is as follows:
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 7 13:18:02 2017
@author: jcj
"""
import urllib2
import gzip
import StringIO
from bs4 import BeautifulSoup


class JobScrapy(object):

    def __init__(self):
        '''Initialize the request headers and the home URL.'''
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.myheaders = {'User-Agent': user_agent}
        self.myhomeUrl = ''

    def setUrl(self, strUrl):
        '''Set the homepage URL to scrape from.'''
        self.myhomeUrl = strUrl

    def _fetch(self, url):
        '''Request a page and return its HTML, gunzipping the body
        when the Content-Encoding header reports gzip.'''
        req = urllib2.Request(url=url, data=None, headers=self.myheaders)
        response = urllib2.urlopen(req)
        html = response.read()
        if response.info().get('Content-Encoding') == 'gzip':
            data = StringIO.StringIO(html)
            html = gzip.GzipFile(fileobj=data).read()
        return html

    def load_homePage(self):
        '''Fetch the homepage and return its HTML.'''
        return self._fetch(self.myhomeUrl)

    def load_cityPage(self, strCity):
        '''Fetch a city page such as http://www.yingjiesheng.com/beijing/.'''
        cityUrl = self.myhomeUrl + '/' + strCity + '/'
        return self._fetch(cityUrl)

    def down_homePage(self, home_html):
        '''Write the parsed homepage into a txt file.'''
        home_soup = BeautifulSoup(home_html, 'html.parser')
        file_home_page = open('file_home_page.txt', 'w')
        file_home_page.write(str(home_soup))  # the soup object must be cast to str before writing
        file_home_page.close()

    def down_cityPage(self, city_html, strcity):
        '''Write the parsed city page into a txt file, then pull out the
        info-session and job links and save them to file_<city>_xjh.txt.'''
        city_soup = BeautifulSoup(city_html, 'html.parser')
        filename1 = 'file_' + strcity + '_page.txt'
        file_city_page = open(filename1, 'w')
        file_city_page.write(str(city_soup))
        file_city_page.close()
        # Info sessions sit in two <ul> blocks, jobs in a <tbody>; the links
        # of interest all open in a new tab, so filter on target="_blank".
        # (This assumes all three containers exist on the city page.)
        item1 = city_soup.find('ul', {'class': 'xjh-c'})
        item2 = city_soup.find('ul', {'class': 'list'})
        item3 = city_soup.find('tbody', {'id': 'tb_job_list'})
        itemList1 = item1.find_all(target='_blank')
        itemList2 = item2.find_all(target='_blank')
        itemList3 = item3.find_all(target='_blank')
        filename2 = 'file_' + strcity + '_xjh.txt'
        file_city_xjh = open(filename2, 'w')
        file_city_xjh.write('Info sessions:' + '\n')
        for xjh in itemList1:
            file_city_xjh.write(xjh.get_text().encode('utf-8') + ':' + str(xjh.get('href')) + '\n')
        for xjh in itemList2:
            file_city_xjh.write(xjh.get_text().encode('utf-8') + ':' + str(xjh.get('href')) + '\n')
        file_city_xjh.write('\n' + 'Jobs:' + '\n')
        for job in itemList3:
            file_city_xjh.write(job.get_text().encode('utf-8') + ':' + str(job.get('href')) + '\n')
        file_city_xjh.close()


def main():
    site = 'http://www.yingjiesheng.com'
    JS = JobScrapy()
    JS.setUrl(site)
    home_html = JS.load_homePage()
    JS.down_homePage(home_html)
    strcities = ['beijing', 'shanghai', 'guangzhou']
    for strcity in strcities:
        city_html = JS.load_cityPage(strcity)
        JS.down_cityPage(city_html, strcity)


if __name__ == '__main__':
    main()
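For anyone on Python 3, urllib2 and StringIO no longer exist; the same gzip check would look roughly like this (an untested sketch, not part of the original script):

import gzip
import urllib.request

req = urllib.request.Request('http://www.yingjiesheng.com',
                             headers={'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'})
response = urllib.request.urlopen(req)
body = response.read()
# gzip.decompress (Python 3.2+) replaces the StringIO + GzipFile dance
if response.info().get('Content-Encoding') == 'gzip':
    body = gzip.decompress(body)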