爬取源
首页
url:http://xgxt.hainu.edu.cn/login.htm
需要绕过登录。方案一:获取验证码图片,用验证码识别库自动识别,但该网站的识别率较低。本文采用方案二:先手动登录获取登录 cookie,再在爬虫代码中携带该 cookie 进行信息爬取。
获取登录cookie
注意点
1.该网站数据为js动态加载,获取数据用到了selenium自动化工具来抓取页面信息。
2.需要加header头。
代码
#-*- coding:utf-8 -*-
from selenium import webdriver
import re
import time
import pymysql
import os
def set_drive(url_request, str_Cookie, phantomjs_path):
    """Create a PhantomJS driver that sends the captured login cookie on every request.

    Args:
        url_request: URL to open once the driver starts.
        str_Cookie: raw ``Cookie`` header string captured from a manual login.
        phantomjs_path: filesystem path to the phantomjs executable.

    Returns:
        A selenium PhantomJS WebDriver with ``url_request`` already loaded.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'xgxt.hainu.edu.cn',
        # 'Referer': 'http://xgxt.hainu.edu.cn/web/home/index',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'Cookie': str_Cookie,
    }
    # PhantomJS picks custom headers up from DesiredCapabilities; this is
    # process-global state and must be set BEFORE the driver is constructed.
    for key, value in headers.items():
        webdriver.DesiredCapabilities.PHANTOMJS[
            'phantomjs.page.customHeaders.{}'.format(key)] = value
    driver = webdriver.PhantomJS(executable_path=phantomjs_path)
    driver.get(url_request)
    return driver
def _read_pagination(driver):
    """Return (raw pagination text, index of last row shown, total row count)."""
    info = driver.find_element_by_xpath('//div[@class="pagination-info"]').text
    nums = re.findall(r"\d+\.?\d*", info)
    # nums looks like [first_row, last_row, total]; only last/total are needed.
    return info, int(nums[1]), int(nums[2])


def get_data(url_request, str_Cookie, phantomjs_path):
    """Crawl every page of one contact listing and persist each page's rows.

    Opens the listing with the logged-in cookie, truncates the local text
    file for this listing, writes a CSV header line, then walks the
    js-rendered paginated grid, saving each page via ``pick_store_data``
    until the last visible row index reaches the total row count.

    Args:
        url_request: listing URL (ends in the type code SX/ZY/BJ).
        str_Cookie: raw login cookie string.
        phantomjs_path: path to the phantomjs executable.
    """
    driver = set_drive(url_request=url_request, str_Cookie=str_Cookie,
                       phantomjs_path=phantomjs_path)
    str_item_num_info, tail_num, total_num = _read_pagination(driver)
    # The last two characters of the URL are the listing type (SX/ZY/BJ)
    # and double as the output file name.
    url_current = driver.current_url.replace('\n', '')
    path_file = "./data/" + url_current[-2:] + ".txt"
    # Start from a clean file, then write the header row once.
    if os.path.exists(path_file):
        os.remove(path_file)
    with open(path_file, "a", encoding='utf8') as txtfile:
        txtfile.write('姓名,性别,手机号码,QQ号码,来源省份,来源市县,学院,专业,班级' + '\n')
    print(str_item_num_info)
    # Save the first page, then keep clicking "next" until done.
    pick_store_data(driver)
    while tail_num < total_num:
        driver.find_element_by_xpath(
            '//span[@class="l-btn-icon pagination-next"]').click()
        time.sleep(2)  # wait for the js grid to re-render
        str_item_num_info, tail_num, total_num = _read_pagination(driver)
        print(str_item_num_info)
        pick_store_data(driver)
        time.sleep(2)
def pick_store_data(driver):
    """Scrape the 9-column data grid on the current page and persist the rows.

    Appends one CSV line per person to the listing's local text file and
    inserts each row into the MySQL table matching the listing type.
    Relies on module-level ``cursor`` and ``conn`` created in ``__main__``.

    Args:
        driver: selenium driver currently showing a listing page.

    Returns:
        The scraped rows as a list of 9-element string lists.

    Raises:
        ValueError: if the URL suffix is not one of BJ/SX/ZY (the original
            code would have used a stale or undefined ``sql`` here).
    """
    path_tr = '//div[@class="datagrid-view2"]//table[@class="datagrid-btable"]/tbody/tr'
    count_person = len(driver.find_elements_by_xpath(path_tr))
    # Read each cell individually; the grid is js-rendered, so per-cell
    # xpath lookups are the reliable way to get the text.
    data = []
    for i in range(count_person):
        person = [driver.find_element_by_xpath(
                      '{}[{}]/td[{}]'.format(path_tr, i + 1, j + 1)).text
                  for j in range(9)]
        data.append(person)
    print(data)
    # The last two characters of the URL identify the listing (SX/ZY/BJ).
    url_current_suffix = driver.current_url.replace('\n', '')[-2:]
    # ---- save to local text file (append one CSV line per person) ----
    with open("./data/" + url_current_suffix + ".txt", "a", encoding='utf8') as txtfile:
        for row in data:
            txtfile.write(','.join(row) + '\n')
    # ---- save to database ----
    table_by_suffix = {'BJ': 'info_bj', 'SX': 'info_sx', 'ZY': 'info_zy'}
    table = table_by_suffix.get(url_current_suffix)
    if table is None:
        raise ValueError('unexpected url suffix: ' + url_current_suffix)
    sql = ('INSERT INTO ' + table +
           ' (姓名,性别,手机号码,QQ号码,来源省份,来源市县,学院,专业,班级)'
           ' VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)')
    for row in data:
        cursor.execute(sql, tuple(row))
    conn.commit()  # one commit per page instead of per row
    return data
if __name__ == '__main__':
    # Read the manually captured login cookie (see the article text above).
    # Use a context manager so the file handle is closed.
    with open("./cookie.txt") as cookie_file:
        str_Cookie = cookie_file.read().replace('\n', '')
    # Raw string: single backslashes are correct here (r"...\\..." would
    # produce literal doubled backslashes in the path).
    phantomjs_path = r"D:\phantomjs-2.1.1-windows\bin\phantomjs.exe"
    # URLs of the three listing types: by home city/county, major, class.
    url_city_county = "http://xgxt.hainu.edu.cn/XS/XSXSXX/TXLB?TX=SX"
    url_major = 'http://xgxt.hainu.edu.cn/XS/XSXSXX/TXLB?TX=ZY'
    url_class = 'http://xgxt.hainu.edu.cn/xs/XSXSXX/TXLB?TX=BJ'
    # Database connection and cursor are module-level globals used by
    # pick_store_data.
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='',
                           db='hndx', charset='utf8')
    cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)
    try:
        get_data(url_city_county, str_Cookie, phantomjs_path)
        # get_data(url_major, str_Cookie, phantomjs_path)
        # get_data(url_class, str_Cookie, phantomjs_path)
    finally:
        # Close the DB resources even if the crawl raises.
        cursor.close()
        conn.close()
原页面结构已变化,以上代码仅供参考。