python爬虫

潘小榭

于 2017-05-25 09:29:40 发布

阅读量358

点赞数

分类专栏： python 文章标签： python 爬虫

本文链接：https://blog.csdn.net/panxiaoxie/article/details/72722259

版权

python 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

# -*- coding: utf-8 -*-

#Author:xiepan

import http.cookiejar
import urllib.request
import re
import urllib
from bs4 import BeautifulSoup

class Spider(object):
    """Log in with a form POST and fetch a second page using the session cookies."""

    def __init__(self):
        print('获取课程成绩和学分，GO!')

    def get_login(self, login_url, post_data, headers, grade_url=None):
        """Post the login form, then return the decoded body of ``grade_url``.

        Both requests go through one opener backed by a single cookie jar, so
        any cookies set by the login response are sent with the second request.

        Args:
            login_url: URL the login form is POSTed to.
            post_data: Mapping of form fields; URL-encoded and sent as GBK bytes.
            headers: Mapping of HTTP headers attached to the login request.
            grade_url: URL of the page to fetch after logging in. When omitted,
                the module-level ``grade_url`` variable is used, which keeps
                older three-argument callers working.

        Returns:
            The body of ``grade_url`` decoded as GBK text.

        Raises:
            ValueError: If ``grade_url`` is not passed and no module-level
                ``grade_url`` is defined.
            urllib.error.URLError: If either request fails.
        """
        if grade_url is None:
            # Backward compatibility: earlier versions silently read a global
            # named ``grade_url``; resolve it explicitly and fail clearly.
            try:
                grade_url = globals()['grade_url']
            except KeyError:
                raise ValueError(
                    'grade_url must be passed or defined at module level'
                ) from None

        # One cookie jar shared by both requests carries the login session.
        cookie = http.cookiejar.LWPCookieJar()
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(cookie))

        encoded_form = urllib.parse.urlencode(post_data).encode(encoding='GBK')
        request = urllib.request.Request(login_url, encoded_form, headers)

        # The login response body is not needed, only its cookies; open it
        # in a ``with`` block so the connection is released.
        with opener.open(request):
            pass

        with opener.open(grade_url) as response:
            return response.read().decode('GBK')

if __name__ == '__main__':
    # Script entry point: log in, download the grade page, then print the
    # course title and grade found in each matching table row.

    # Login endpoint and the grade-query page fetched after logging in.
    login_url = 'http://grdms.bit.edu.cn/yjs/login.do'
    grade_url = 'http://grdms.bit.edu.cn/yjs/yanyuan/py/pychengji.do?method=enterChaxun'

    # Form fields submitted with the login POST.
    post_data = {
        'j_username': '2120140383',
        'j_password': '********',
        'loginType': '0',
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Referer': 'http://grdms.bit.edu.cn/yjs/login.jsp',
    }

    crawler = Spider()
    page_source = crawler.get_login(login_url, post_data, headers)
    soup = BeautifulSoup(page_source, 'html.parser')

    # A single dict is reused and overwritten for every row that is printed.
    info = {}
    cell_pattern = re.compile('<td align="left">(.*?)</td>', re.S)

    for row in soup.find_all('tr', bgcolor="#FFFFFF", height="23"):
        cells = cell_pattern.findall(str(row))
        try:
            # Cell 3 is used as the title and cell 9 as the grade.
            info['title'] = cells[3]
            info['grade'] = cells[9]
        except IndexError:
            # Rows with too few left-aligned cells are reported with this
            # fixed message instead of a title/grade pair.
            print('第一栏标题')
        else:
            print(info)