Crawling the Soft Exam (软考) daily practice questions with Python and storing them in a database

Python source code (no framework)
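The script has no framework dependencies, only the handful of libraries visible in the imports below; on a fresh environment something like the following should pull them in:

pip install requests lxml pandas numpy pymysql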

# -*- coding: utf-8 -*-
import requests
from lxml import etree
import json
import re
import pandas as pd
import numpy as np
import pymysql

# Cookie copied from a logged-in browser session on the target site
cookie = ""

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
    "Cookie": cookie,
}

# pymysql database helper
class MysqlAct(object):
    def __init__(self):
        self.connect = pymysql.connect(host='localhost', user='root', password='root',
                                       database='tpcommon', use_unicode=True, charset='utf8')
        self.cursor = self.connect.cursor()

    def select(self, sql):
        self.cursor.execute(sql)
        return self.cursor.fetchall()

    def find(self, sql):
        self.cursor.execute(sql)
        return self.cursor.rowcount

    def insert(self, sql):
        self.cursor.execute(sql)
        self.connect.commit()

    def update(self, sql):
        self.cursor.execute(sql)
        self.connect.commit()

    def close(self):
        self.connect.close()

# Crawler class
class Spider(object):
    def __init__(self):
        self.page = 1

    # Step 1: fetch the raw question list pages and store title/tcid
    def GetClist(self, totalpage=2):
        mysql = MysqlAct()
        while self.page <= totalpage:
            print("begin----", self.page)
            # list-page URL left blank / templated in the original source
            url = ''
            classurl = "url-{}.html".format(self.page)
            r = requests.get(classurl, headers=header)
            html = etree.HTML(r.content)
            items = html.xpath("//div[@class='ecv2_tikucom_doItem clearfix']")
            for v in items:
                title = v.xpath(".//div[@class='ecv2_tikucom_doTitle ecv2_marginbottom16']/text()")[0]
                a = v.xpath(".//a//@href")[0]
                if a == 'javascript:;':
                    a = v.xpath(".//a//@data-accessid")[0]
                id = re.findall(r'\d+', a)
                row = (title, id[0])
                fields = '''(title, tcid)'''
                sql = "insert into fcxlt_a_ruankao_list %s VALUES %s" % (fields, row)
                mysql.insert(sql)
            self.page = self.page + 1
            print("over---", self.page)
        mysql.close()

    # Step 2: fetch the test-report id for each list entry and store it
    def GetChecks(self):
        # Some requests return 404 and there is no error handling yet; for a second pass use:
        # sql = "select * from fcxlt_a_ruankao_list where checkid is null order by id asc"
        sql = "select * from fcxlt_a_ruankao_list order by id asc"
        mysql = MysqlAct()
        res = mysql.select(sql)
        df = pd.DataFrame(res)
        df.columns = ['id', 'title', 'tcid', 'checkid', 'checkurl']
        url = "https://uc.educity.cn/ucapi/uc/paper/startExam.do"
        for i in range(df.shape[0]):
            id = df['id'][i]
            oldtcid = df['tcid'][i]
            r = requests.post(url, data={'tcId': oldtcid, 'model': 'Exam'}, headers=header)
            json_a = json.loads(r.content)
            newid = json_a['model']['data']
            teata = "https://uc.educity.cn/tiku/testReport.html?id=" + str(json_a['model']['data'])
            upsql = "UPDATE fcxlt_a_ruankao_list SET checkid = '%d',checkurl='%s' WHERE id = '%d'" % (newid, teata, id)
            mysql.update(upsql)
        mysql.close()

    # Step 3: fetch the questions, options, answers and analysis for each report
    def GetQes(self):
        sql = "select * from fcxlt_a_ruankao_list order by id asc"
        mysql = MysqlAct()
        res = mysql.select(sql)
        df = pd.DataFrame(res)
        df.columns = ['id', 'title', 'tcid', 'checkid', 'checkurl']
        url = "https://uc.educity.cn/ucapi/uc/testPaperLog/loadShitiLogByTestId.do"
        for i in range(df.shape[0]):
            checkid = df['checkid'][i]
            r = requests.post(url, data={'paperLogId': checkid}, headers=header)
            json_a = json.loads(r.content)
            for j in range(10):  # each daily practice paper here contains 10 questions
                s = json_a['model'][j]['shiti']
                title = "'" + s['tigan'] + "'"
                ansy = "'" + s['analysis'] + "'"
                xuanxiang = "'" + s['questionDelHTMLTag'] + "'"
                answer = "'" + s['answerStr'] + "'"
                num = s['questionNum']
                shitiid = s['id']
                sqlf = "select * from fcxlt_a_ruankao_shiti where shitiid = %d" % (shitiid)
                # count = mysql.find(sqlf)
                # print(count, shitiid)
                # print("insert---------", count, shitiid)
                rows = (title, ansy, xuanxiang, answer, num, checkid, shitiid)
                fields = '''(title,ansy,xuanxiang,answer,num,checkid,shitiid)'''
                sql = "insert into fcxlt_a_ruankao_shiti %s VALUES %s" % (fields, rows)
                mysql.insert(sql)
        mysql.close()

if __name__ == '__main__':
    # Fetch the lists and store them in the database
    spider = Spider()
    # spider.GetClist(30)   # Step 1
    # spider.GetChecks()    # Step 2
    # spider.GetQes()       # Step 3
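One caveat about the inserts above: the SQL strings are built by interpolating a Python tuple directly into the statement, which works for this data set but breaks as soon as a title or analysis contains quote characters. A safer variant is pymysql's parameter binding; the sketch below (insert_shiti is a hypothetical helper, not part of the original script) shows the question insert rewritten that way:

# Hypothetical helper: the same insert as GetQes(), but with parameter binding,
# so quotes or HTML inside the question text cannot break the SQL statement.
def insert_shiti(connect, row):
    # row = (title, ansy, xuanxiang, answer, num, checkid, shitiid)
    sql = ("insert into fcxlt_a_ruankao_shiti "
           "(title, ansy, xuanxiang, answer, num, checkid, shitiid) "
           "values (%s, %s, %s, %s, %s, %s, %s)")
    with connect.cursor() as cursor:
        cursor.execute(sql, row)
    connect.commit()

With this variant, the manual quoting in GetQes (wrapping each field in single quotes) would no longer be needed.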

MySQL table creation statements

CREATE TABLE `fcxlt_a_ruankao_list` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) DEFAULT NULL,
  `tcid` int(11) DEFAULT NULL,
  `checkid` int(11) DEFAULT NULL,
  `checkurl` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=MyISAM AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4;

CREATE TABLE `fcxlt_a_ruankao_shiti` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `listid` int(11) DEFAULT NULL,
  `title` text,
  `answer` varchar(255) DEFAULT NULL,
  `ansy` text,
  `xuanxiang` text,
  `num` int(11) DEFAULT NULL,
  `checkid` int(11) DEFAULT NULL,
  `shitiid` int(11) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=MyISAM AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4;
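After all three steps have run, a quick way to sanity-check the result is to join the two tables on checkid and look at a few rows. A minimal sketch, assuming the table layout above and the same local credentials as the script:

import pandas as pd
import pymysql

# Open the same local database used by the crawler
connect = pymysql.connect(host='localhost', user='root', password='root',
                          database='tpcommon', charset='utf8')
sql = """
    select l.title as paper_title, s.num, s.title as question, s.answer
    from fcxlt_a_ruankao_list l
    join fcxlt_a_ruankao_shiti s on s.checkid = l.checkid
    order by l.id, s.num
"""
# Print the first few crawled questions with their source paper and answer
print(pd.read_sql(sql, connect).head(10))
connect.close()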
