For a work task I needed to crawl a large amount of junior and senior high school knowledge-point content from the web. Having never worked with crawlers before, I adapted code found online and used it to extract the knowledge points for the relevant subjects from the 51edu site. (Kept purely as a personal record; not recommended as a reference.)
Tools used: requests + Beautiful Soup 4 + the Chrome browser.
requests is simple and convenient to use and issues the page requests; BeautifulSoup is used to extract the data from the returned HTML.
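As a quick illustration of how the two libraries fit together (a minimal sketch only; the URL here is just the site's front page, used as a placeholder):
import requests
from bs4 import BeautifulSoup

resp = requests.get("http://www.51edu.com")           # issue the request
soup = BeautifulSoup(resp.content, "html.parser")     # parse the returned HTML
print(soup.title.string if soup.title else "")        # pull out one piece of data (the page title)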
The full code is below. (It was written just to get the task done, so it is fairly rough and has not been optimized.)
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup
# Fetch the text content of one article page
def gethtml(url):
    """
    :param url: URL of the article page
    :return: extracted text content
    """
    req = requests.get(url=url)
    # Inspect the encoding the page declares; for this site it is gb2312
    encoding = requests.utils.get_encodings_from_content(req.text)
    # print(encoding)
    content = req.content.decode('gb2312', errors='ignore')
    # Locate the block that holds the article body
    bf = BeautifulSoup(content, 'html.parser')
    texts = bf.find_all('div', class_='conL-1-2')
    try:
        # Keep only the text, stripping spaces and newlines
        texts = texts[0].text.replace(' ', '').replace("\n", '')
    except IndexError:
        # The page has no conL-1-2 block; fall through and return what we have
        pass
    # print(texts)
    return texts
# Collect the URL of every topic listed on a subject's listing page
def geturl(web_link, server):
    """
    :param web_link: URL of the subject's listing page
    :param server: base URL of the site
    :return: dict mapping topic title to topic URL
    """
    dic_url = {}
    # Link texts that are navigation or category labels rather than topics; these are skipped
    exclude_list = ["高一数学知识点", "高二数学知识点", "高三数学知识点", "高一英语知识点", "高二英语知识点", "高三英语知识点",
                    "高一语文知识点", "高二语文知识点", "高三语文知识点", "初一数学知识点", "初二数学知识点", "初三数学知识点",
                    "初一英语知识点", "初二英语知识点", "初三英语知识点", "英语知识点", "初一语文知识点", "初二语文知识点",
                    "初三语文知识点", "物理知识点", "化学知识点", "【详情】", "分享", "首页", "上一页", "尾页"]
    req = requests.get(url=web_link)
    content = req.content.decode('gb2312', errors='ignore')
    bf = BeautifulSoup(content, 'html.parser')
    div = bf.find_all('div', class_='lb-lt')
    # The topic links sit inside the first lb-lt block; the first few <a> tags are navigation
    a = BeautifulSoup(str(div[0]), 'html.parser').find_all('a')
    for each in a[5:]:
        if each.string in exclude_list:
            continue
        full_url = server + each.get('href')
        dic_url[each.string] = full_url
    print(dic_url)
    return dic_url
# Append crawled text to the output file
def contents_save(file_path, content):
    """
    :param file_path: path of the file the crawled text is saved to
    :param content: crawled text content
    :return: None
    """
    with open(file_path, 'a', encoding="utf-8") as f:
        f.write(str(content))
        f.write('\n')
# Crawl every topic on the current listing page, then move on to the next listing page if one exists
def judge_nextweb(dic_url):
    """
    :param dic_url: dict of all topic URLs on the current listing page
    :return: URL of the next listing page
    """
    for title in dic_url.keys():
        if title != "下一页":          # "下一页" is the "next page" navigation link
            url = dic_url[title]
            print(title)
            # A topic may itself span several pages; crawl the extra pages first
            in_dic_url = next_url_judge(url)
            for titles in in_dic_url:
                urls = in_dic_url[titles]
                content = gethtml(urls)
                contents_save(file_path, content)   # file_path is the global set in __main__
            # Then crawl the topic's first page
            content = gethtml(url)
            contents_save(file_path, content)
    return dic_url["下一页"]
# Check whether a topic spans several pages; if so, collect the URL of each additional page
def next_url_judge(in_url):
    """
    :param in_url: topic URL
    :return: dict of page-number links; empty if the topic has only one page
    """
    in_dic_url = {}
    # Pagination links are labelled with bare page numbers, so only those labels are kept
    exclude_list = [str(i) for i in range(1, 20)]
    req = requests.get(url=in_url)
    content = req.content.decode('gb2312', errors='ignore')
    # Locate the pagination block
    bf = BeautifulSoup(content, 'html.parser')
    text = bf.find_all('div', id="pages")
    a = BeautifulSoup(str(text[0]), 'html.parser').find_all('a')
    for each in a:
        if each.string in exclude_list:
            in_dic_url[each.string] = server + each.get('href')   # server is the global set in __main__
    print(in_dic_url)
    return in_dic_url
if __name__ == '__main__':
    save_dir = "E:/实习/data/subject_dataset"
    subject_file = "senior_chemistry1.txt"
    file_path = save_dir + '/' + subject_file
    server = "http://www.51edu.com"
    web_link = "http://www.51edu.com/gaozhong/gaoyi/huaxue/zhishidian/"   # listing page to crawl
    web_transfer = web_link
    cal = 0
    while cal < 9:   # 9 = number of listing pages for this subject
        dic_url = geturl(web_transfer, server)
        web_transfer = judge_nextweb(dic_url)
        cal += 1
        print(cal)
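One loose end worth noting: every response is decoded with a hardcoded 'gb2312' because that is what this site declares. If the same code were pointed at another site, one alternative (an untested sketch, not part of the script above) is to let requests guess the charset via the Response object's apparent_encoding attribute:
import requests

resp = requests.get("http://www.51edu.com/gaozhong/gaoyi/huaxue/zhishidian/")
resp.encoding = resp.apparent_encoding   # guess the charset from the response body
content = resp.text                      # text decoded with the guessed encoding
print(content[:200])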