爬取源
首页
url:http://xgxt.hainu.edu.cn/login.htm
需要绕过登录。方案一:获取验证码图片,用验证码识别库自动识别,但该网站的识别率较低。本文采用方案二:先手动登录获取登录 cookie,再在爬虫代码中携带该 cookie 进行信息爬取。
获取登录cookie
注意点
1.该网站数据为js动态加载,获取数据用到了selenium自动化工具来抓取页面信息。
2.需要加header头。
代码
#-*- coding:utf-8 -*-
from selenium import webdriver
import re
import time
import pymysql
import os
def set_drive(url_request, str_Cookie, phantomjs_path):
    """Create a PhantomJS driver that sends the captured login cookie on every request.

    Args:
        url_request: URL to open once the driver starts.
        str_Cookie: raw ``Cookie`` header string captured from a manual login.
        phantomjs_path: filesystem path to the phantomjs executable.

    Returns:
        A selenium PhantomJS WebDriver with ``url_request`` already loaded.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'xgxt.hainu.edu.cn',
        # 'Referer': 'http://xgxt.hainu.edu.cn/web/home/index',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'Cookie': str_Cookie,
    }
    # PhantomJS picks custom headers up from DesiredCapabilities; this is
    # process-global state and must be set BEFORE the driver is constructed.
    for key, value in headers.items():
        webdriver.DesiredCapabilities.PHANTOMJS[
            'phantomjs.page.customHeaders.{}'.format(key)] = value
    driver = webdriver.PhantomJS(executable_path=phantomjs_path)
    driver.get(url_request)
    return driver
def _read_pagination(driver):
    """Return (raw pagination text, index of last row shown, total row count)."""
    info = driver.find_element_by_xpath('//div[@class="pagination-info"]').text
    nums = re.findall(r"\d+\.?\d*", info)
    # nums looks like [first_row, last_row, total]; only last/total are needed.
    return info, int(nums[1]), int(nums[2])


def get_data(url_request, str_Cookie, phantomjs_path):
    """Crawl every page of one contact listing and persist each page's rows.

    Opens the listing with the logged-in cookie, truncates the local text
    file for this listing, writes a CSV header line, then walks the
    js-rendered paginated grid, saving each page via ``pick_store_data``
    until the last visible row index reaches the total row count.

    Args:
        url_request: listing URL (ends in the type code SX/ZY/BJ).
        str_Cookie: raw login cookie string.
        phantomjs_path: path to the phantomjs executable.
    """
    driver = set_drive(url_request=url_request, str_Cookie=str_Cookie,
                       phantomjs_path=phantomjs_path)
    str_item_num_info, tail_num, total_num = _read_pagination(driver)
    # The last two characters of the URL are the listing type (SX/ZY/BJ)
    # and double as the output file name.
    url_current = driver.current_url.replace('\n', '')
    path_file = "./data/" + url_current[-2:] + ".txt"
    # Start from a clean file, then write the header row once.
    if os.path.exists(path_file):
        os.remove(path_file)
    with open(path_file, "a", encoding='utf8') as txtfile:
        txtfile.write('姓名,性别,手机号码,QQ号码,来源省份,来源市县,学院,专业,班级' + '\n')
    print(str_item_num_info)
    # Save the first page, then keep clicking "next" until done.
    pick_store_data(driver)
    while tail_num < total_num:
        driver.find_element_by_xpath(
            '//span[@class="l-btn-icon pagination-next"]').click()
        time.sleep(2)  # wait for the js grid to re-render
        str_item_num_info, tail_num, total_num = _read_pagination(driver)
        print(str_item_num_info)
        pick_store_data(driver)
        time.sleep(2)
def pick_store_data(driver):
    """Scrape the 9-column data grid on the current page and persist the rows.

    Appends one CSV line per person to the listing's local text file and
    inserts each row into the MySQL table matching the listing type.
    Relies on module-level ``cursor`` and ``conn`` created in ``__main__``.

    Args:
        driver: selenium driver currently showing a listing page.

    Returns:
        The scraped rows as a list of 9-element string lists.

    Raises:
        ValueError: if the URL suffix is not one of BJ/SX/ZY (the original
            code would have used a stale or undefined ``sql`` here).
    """
    path_tr = '//div[@class="datagrid-view2"]//table[@class="datagrid-btable"]/tbody/tr'
    count_person = len(driver.find_elements_by_xpath(path_tr))
    # Read each cell individually; the grid is js-rendered, so per-cell
    # xpath lookups are the reliable way to get the text.
    data = []
    for i in range(count_person):
        person = [driver.find_element_by_xpath(
                      '{}[{}]/td[{}]'.format(path_tr, i + 1, j + 1)).text
                  for j in range(9)]
        data.append(person)
    print(data)
    # The last two characters of the URL identify the listing (SX/ZY/BJ).
    url_current_suffix = driver.current_url.replace('\n', '')[-2:]
    # ---- save to local text file (append one CSV line per person) ----
    with open("./data/" + url_current_suffix + ".txt", "a", encoding='utf8') as txtfile:
        for row in data:
            txtfile.write(','.join(row) + '\n')
    # ---- save to database ----
    table_by_suffix = {'BJ': 'info_bj', 'SX': 'info_sx', 'ZY': 'info_zy'}
    table = table_by_suffix.get(url_current_suffix)
    if table is None:
        raise ValueError('unexpected url suffix: ' + url_current_suffix)
    sql = ('INSERT INTO ' + table +
           ' (姓名,性别,手机号码,QQ号码,来源省份,来源市县,学院,专业,班级)'
           ' VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)')
    for row in data:
        cursor.execute(sql, tuple(row))
    conn.commit()  # one commit per page instead of per row
    return data
if __name__ == '__main__':
    # Read the manually captured login cookie (see the article text above).
    # Use a context manager so the file handle is closed.
    with open("./cookie.txt") as cookie_file:
        str_Cookie = cookie_file.read().replace('\n', '')
    # Raw string: single backslashes are correct here (r"...\\..." would
    # produce literal doubled backslashes in the path).
    phantomjs_path = r"D:\phantomjs-2.1.1-windows\bin\phantomjs.exe"
    # URLs of the three listing types: by home city/county, major, class.
    url_city_county = "http://xgxt.hainu.edu.cn/XS/XSXSXX/TXLB?TX=SX"
    url_major = 'http://xgxt.hainu.edu.cn/XS/XSXSXX/TXLB?TX=ZY'
    url_class = 'http://xgxt.hainu.edu.cn/xs/XSXSXX/TXLB?TX=BJ'
    # Database connection and cursor are module-level globals used by
    # pick_store_data.
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='',
                           db='hndx', charset='utf8')
    cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)
    try:
        get_data(url_city_county, str_Cookie, phantomjs_path)
        # get_data(url_major, str_Cookie, phantomjs_path)
        # get_data(url_class, str_Cookie, phantomjs_path)
    finally:
        # Close the DB resources even if the crawl raises.
        cursor.close()
        conn.close()
原页面结构已变化,以上代码仅供参考。