My First Python Web Scraper

Target site: www.yingjiesheng.com

Data scraped: the campus info sessions (宣讲会) and job postings pushed for the three cities Beijing, Shanghai, and Guangzhou on www.yingjiesheng.com, a job board for fresh graduates.

Taking Beijing as an example, the scraped content is stored in the file file_beijing_xjh.txt.

While scraping, I ran into pages that the server returns gzip-compressed; I later found a workaround online: when the Content-Encoding header says gzip, decompress the response body manually before parsing it.
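In isolation, that gzip handling looks roughly like this (a minimal Python 2 sketch, assuming the response comes from urllib2; the full scraper below uses the same pattern):

import urllib2
import gzip
import StringIO

req = urllib2.Request('http://www.yingjiesheng.com')
resp = urllib2.urlopen(req)
body = resp.read()
# Only decompress when the server actually gzip-encoded the response.
if resp.info().get('Content-Encoding') == 'gzip':
    body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()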

The full code is as follows:

# -*- coding: utf-8 -*-
"""
Created on Fri Apr  7 13:18:02 2017

@author: jcj

Note: this script targets Python 2 (urllib2 and StringIO do not exist in Python 3).
"""

import urllib2          # HTTP requests (Python 2)
import gzip             # decompress gzip-encoded response bodies
import StringIO         # wrap response bytes in a file-like object for GzipFile
from bs4 import BeautifulSoup

class JobScrapy():

    def __init__(self):
        '''Initialize the scraper: set up the request headers.'''
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.myheaders = {'User-Agent': user_agent}
        self.myhomeUrl = ''

    def setUrl(self, strUrl):
        '''Set the homepage URL to scrape.'''
        self.myhomeUrl = strUrl

    def load_homePage(self):
        '''Fetch the homepage via homeUrl and return its HTML (decompressed if gzip-encoded).'''
        req = urllib2.Request(url=self.myhomeUrl, data=None, headers=self.myheaders)
        home_request = urllib2.urlopen(req)
        home_html = home_request.read()
        # If the server gzip-compressed the response, decompress it before returning.
        if home_request.info().get('Content-Encoding') == 'gzip':
            data = StringIO.StringIO(home_html)
            gzipper = gzip.GzipFile(fileobj=data)
            home_html = gzipper.read()
        return home_html
    
    def load_cityPage(self, strCity):
        '''Fetch the listing page for one city and return its HTML (decompressed if gzip-encoded).'''
        cityUrl = self.myhomeUrl + '/' + strCity + '/'
        req = urllib2.Request(url=cityUrl, data=None, headers=self.myheaders)
        city_request = urllib2.urlopen(req)
        city_html = city_request.read()
        # Same gzip handling as load_homePage.
        if city_request.info().get('Content-Encoding') == 'gzip':
            data = StringIO.StringIO(city_html)
            gzipper = gzip.GzipFile(fileobj=data)
            city_html = gzipper.read()
        return city_html
    
    def down_homePage(self, home_html):
        '''Write the parsed homepage content to a txt file.'''
        home_soup = BeautifulSoup(home_html, 'html.parser')
        file_home_page = open("file_home_page.txt", "w")
        file_home_page.write(str(home_soup))
        file_home_page.close()
    
    
    def down_cityPage(self, city_html, strcity):
        '''Dump the city page to a txt file, then extract info-session and job links.'''
        city_soup = BeautifulSoup(city_html, 'html.parser')

        # Write the whole parsed city page to a txt file.
        filename1 = 'file_' + strcity + '_page' + '.txt'
        file_city_page = open(filename1, "w")
        file_city_page.write(str(city_soup))  # the BeautifulSoup object must be converted to str before writing
        file_city_page.close()

        # Locate the info-session (xjh) lists and the job table, then collect their links.
        item1 = city_soup.find('ul', {'class': 'xjh-c'})
        item2 = city_soup.find('ul', {'class': 'list'})
        item3 = city_soup.find('tbody', {'id': 'tb_job_list'})
        itemList1 = item1.find_all(target='_blank')
        itemList2 = item2.find_all(target='_blank')
        itemList3 = item3.find_all(target='_blank')

        # Write one "title:href" line per entry to the city's xjh file.
        filename2 = 'file_' + strcity + '_xjh' + '.txt'
        file_city_xjh = open(filename2, "w")
        file_city_xjh.write('宣讲会信息:' + '\n')   # "Info sessions:"
        for xjh in itemList1:
            file_city_xjh.write(xjh.get_text().encode('utf-8') + ':' + str(xjh.get('href')) + '\n')
        for xjh in itemList2:
            file_city_xjh.write(xjh.get_text().encode('utf-8') + ':' + str(xjh.get('href')) + '\n')
        file_city_xjh.write('\n' + '工作信息:' + '\n')  # "Job postings:"
        for job in itemList3:
            file_city_xjh.write(job.get_text().encode('utf-8') + ':' + str(job.get('href')) + '\n')
        file_city_xjh.close()
    
def main():
    site = 'http://www.yingjiesheng.com'
    JS = JobScrapy()
    JS.setUrl(site)
    home_html = JS.load_homePage()
    JS.down_homePage(home_html)
    
    strcities = ['beijing','shanghai','guangzhou']
    for strcity in strcities:
        city_html = JS.load_cityPage(strcity)
        JS.down_cityPage(city_html,strcity)


if __name__ == '__main__':
    main()
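Note that urllib2 and StringIO only exist in Python 2. Purely as a reference, the fetch-and-decompress step could be written for Python 3 roughly as below (a sketch, not part of the original script; the fetch helper name is mine). Libraries such as requests decode gzip responses transparently, which would remove the manual step altogether.

import gzip
import io
import urllib.request

def fetch(url, headers=None):
    # Fetch a page and decompress it manually if the server sent gzip (Python 3 sketch).
    req = urllib.request.Request(url, headers=headers or {})
    with urllib.request.urlopen(req) as resp:
        body = resp.read()
        if resp.headers.get('Content-Encoding') == 'gzip':
            body = gzip.GzipFile(fileobj=io.BytesIO(body)).read()
    return body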

