下面直接贴出完整的爬虫代码:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib import request
import pymysql.cursors
import re
# Crawl the first 10 pages of the Codeforces contest list, follow each
# contest link, and insert every problem (contest name, problem name,
# problem URL) into the MySQL table `cf`.
#
# NOTE(review): the paste had its indentation stripped; the structure below
# is reconstructed from the control flow of the original.

num = 0  # total number of problem rows inserted so far

# Codeforces rejects urllib's default agent (it has anti-crawler measures),
# so every request is sent with a browser-like User-Agent header.
_USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
               "(KHTML, like Gecko) Chrome/69.0.3497.12 Safari/537.36")


def _fetch(url):
    """Fetch *url* with a browser User-Agent and return the UTF-8 decoded body."""
    req = request.Request(url)
    req.add_header("User-Agent", _USER_AGENT)
    return urlopen(req).read().decode("UTF-8")


# FIX: open ONE database connection for the whole crawl. The original code
# opened and closed a fresh connection for every single inserted row, inside
# the innermost loop.
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='admin',
                             db='intelligence',
                             charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        # Parameterized statement — values are bound by the driver, not
        # string-formatted into the SQL.
        sql = "insert into `cf` (`problem`, `problemName`, `problemUrl`) values (%s, %s, %s)"
        for i in range(10):
            print("当前正处于", (i + 1), "页")
            page_url = "http://codeforces.com/contests/page/" + str(i + 1)
            soup = BeautifulSoup(_fetch(page_url), "html.parser")
            # [4:-1]: the first 4 <tr> are table chrome / upcoming contests
            # and the last row is pagination — keep only the contest rows.
            for row in soup.findAll("tr")[4:-1]:
                # Contest name = first <td> text minus its trailing 5
                # whitespace-separated tokens (dates/registration links).
                # NOTE(review): joined with "" as in the original, so the
                # name's words are concatenated without spaces.
                name = "".join(row.find("td").get_text().split()[0:-5])
                link = row.a  # first anchor in the row -> contest href
                contest_url = "http://codeforces.com" + link["href"]
                soup1 = BeautifulSoup(_fetch(contest_url), "html.parser")
                # Problems come from the first <select>; its first <option>
                # is a placeholder and is skipped.
                for opt in soup1.find("select").findAll("option")[1:]:
                    num += 1
                    problem_url = contest_url + "/problem/" + opt["value"]
                    cursor.execute(sql, (name, opt.get_text(), problem_url))
                    connection.commit()
                    print("当前以添加:", num, "条数据")
finally:
    # Close the shared connection exactly once, even on error.
    connection.close()
print("完成")
注意:Codeforces 有反爬虫机制,直接用 urllib 默认的 User-Agent 会被拒绝,因此代码里为每个请求都加上了浏览器的 User-Agent 头。
数据库表 `cf` 的设计如下: