极客学院单线程爬虫py3.x版本

极客学院单线程爬虫实战篇,老师是用的py2.x版本,而我是3.5版本,因此有一些在解码上的不同。
因此,比照老师的代码写了一遍,放在这里。


#tips:
#tips: 貌似py3.x在file操作上没有writelines(或许有,我没看到?)但是有readline(字符型)或者readlines(列表型),识别文档的每一行的准则,就是查找'\n'。


# -*- coding: utf-8 -*-
"""
Created on Fri May 13 2016

@author: s
"""
import requests
import re

class spider(object):
    """Single-threaded scraper for the jikexueyuan.com course catalogue.

    Workflow: expand a start URL into one URL per catalogue page
    (get_pages_url), download each page (get_html), split it into
    per-course HTML fragments (get_class), extract the course fields
    from each fragment (get_info), and write everything to a text
    file (save_info).
    """

    def __init__(self):
        print('Start...')

    def get_html(self, url):
        """Download one page and return its decoded text.

        A browser User-Agent is sent because many sites reject the
        default ``requests`` UA string.
        """
        header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'}
        html = requests.get(url, headers=header)
        return html.text

    def get_pages_url(self, url, total_page_nums):
        """Return the list of catalogue-page URLs.

        Starting from the page number embedded in ``url`` (its
        ``pageNum=`` query parameter), build one URL per page up to
        ``total_page_nums`` inclusive by rewriting that parameter.
        """
        now_page_num = int(re.search(r'pageNum=(\d+)', url).group(1))
        page_link_group = []
        for i in range(now_page_num, total_page_nums + 1):
            # BUG FIX: the original passed re.S as the 4th positional
            # argument of re.sub, which is `count`, not `flags`. No flag
            # is needed here (the pattern has no '.'), so it is dropped.
            page_link = re.sub(r'pageNum=\d+', 'pageNum=%d' % i, url)
            page_link_group.append(page_link)
        return page_link_group

    def get_class(self, page_html):
        """Split a catalogue page into one HTML fragment per course.

        Each course sits in an ``<li i...>...</li>`` element; re.S lets
        '.' cross newlines inside the fragment.
        """
        each_class_html = re.findall('<li i(.*?)</li>', page_html, re.S)
        return each_class_html

    def get_info(self, eachclass):
        """Extract one course's fields from its HTML fragment.

        Returns a dict with keys: title, content (description),
        classtime, classlevel, learnnum (all raw strings as matched).
        """
        info = {}
        info['title'] = re.search('title="(.*?)"', eachclass, re.S).group(1)
        info['content'] = re.search(' display: none;">(.*?)</p>', eachclass, re.S).group(1)
        # The first two plain <em> tags hold duration and difficulty,
        # in that order; the learner count sits in a classed <em>.
        time_and_level = re.findall('<em>(.*?)</em>', eachclass, re.S)
        info['classtime'] = time_and_level[0]
        info['classlevel'] = time_and_level[1]
        info['learnnum'] = re.search('"learn-number">(.*?)</em>', eachclass, re.S).group(1)
        return info

    def save_info(self, classinfo):
        """Write the collected course dicts to a UTF-8 text file.

        Opens in binary mode and encodes explicitly, matching the
        original Py2->Py3 port; `with` guarantees the handle is closed
        even if a write fails (the original leaked it on error).
        """
        # Raw string: '\p', '\j', '\J' are invalid escapes and only
        # happened to survive as literals in the original.
        with open(r'E:\pyProject\jikexueyuan_code\practise\JK_course_list2.txt', 'wb') as f:
            for each in classinfo:
                f.write('titles: {}\n'.format(each['title']).encode())
                f.write('content: {}\n'.format(str(each['content']).strip()).encode())
                # Collapse internal whitespace that the site pads
                # around the duration / difficulty strings.
                f.write('classtime: {}\n'.format(''.join(str(each['classtime']).split())).encode())
                f.write('classlevel: {}\n'.format(''.join(each['classlevel'].split())).encode())
                f.write('learnnum: {}\n'.format(each['learnnum']).encode())

# Entry point: scrape the course catalogue (page 1 only) and persist it.
if __name__ == '__main__':
    crawler = spider()
    start_url = 'http://www.jikexueyuan.com/course/?pageNum=1'
    # Expand the start URL into every catalogue page we intend to visit.
    page_links = crawler.get_pages_url(start_url, 1)
    classinfo = []
    for page_url in page_links:
        print('ing...' + page_url)
        page_source = crawler.get_html(page_url)
        for course_html in crawler.get_class(page_source):
            classinfo.append(crawler.get_info(course_html))
    crawler.save_info(classinfo)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值