import GetHtml as g
import SaveData as s

if __name__ == '__main__':
    # Entry address
    address = ['http://news.zzu.edu.cn/mtzd.htm']
    # Queue of addresses already crawled, kept so no page is crawled twice
    bin = []
    # Keep going while there are still uncrawled addresses
    while len(address) != 0:
        get = g.GetHtml()
        htmls = []
        for url in address:
            bin.append(url)
            htmls.append(get.gethtml(url))
        for html in htmls:
            save = s.SaveData(html)
            save.save()
            address.extend(save.getOtherUrl())
        # Keep only the addresses that have not been crawled yet
        address = list(set(address) - set(bin))
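The loop above is a frontier/visited-set crawl: address is the frontier, bin is the visited set, and the set difference at the end removes duplicates before the next round. A minimal self-contained sketch of the same pattern, where fetch_links is a hypothetical stand-in for gethtml() plus getOtherUrl():

def crawl(seed, fetch_links, max_pages=100):
    # Sketch of the frontier/visited-set pattern used by the main loop above
    frontier = [seed]   # addresses still to crawl
    visited = set()     # addresses already crawled
    while frontier and len(visited) < max_pages:
        url = frontier.pop()
        visited.add(url)
        for link in fetch_links(url):
            if link not in visited:
                frontier.append(link)
    return visited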
from bs4 import BeautifulSoup


# This class parses the data and saves it
class SaveData:
    html = ""  # page source

    def __init__(self, html):
        # Constructor: takes the html source
        print("Constructing SaveData")
        self.html = html

    def save(self):
        bs = BeautifulSoup(self.html, "html.parser")  # build the bs object
        data_title_name = bs.select("div[class='new-center']>h3>a")  # article titles
        data_title_detailtime = bs.select("div[class='new-date']")   # publish dates
        file = open("output.html", "a+", encoding="utf-8")
        for i in range(0, len(data_title_name)):
            print("Publish time:", data_title_detailtime[i].get_text(), end="\t")
            print("Title:", data_title_name[i].get_text(), end="\n")
            file.write("<p>Title: " + data_title_name[i].get_text() + " "
                       + "Publish time: " + data_title_detailtime[i].get_text() + "\n</p>")
        file.close()

    # Collects the links of the next pages to crawl
    def getOtherUrl(self):
        bs = BeautifulSoup(self.html, "html.parser")  # build the bs object
        nexturl = []
        tem = bs.select("span[class='p_no']>a")  # pagination links
        print(tem)
        for url in tem:
            a = url['href']
            if "../" in a:
                a = str(a)[3:]   # strip the leading "../"
            elif "mtzd" in a:
                a = a[5:]        # strip the leading "mtzd/"
            if a == "mtzd.htm":
                print("http://news.zzu.edu.cn/" + a)
                nexturl.append("http://news.zzu.edu.cn/" + a)
            else:
                nexturl.append("http://news.zzu.edu.cn/mtzd/" + a)
                print("http://news.zzu.edu.cn/mtzd/" + a)
        return nexturl
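BeautifulSoup's select takes CSS selectors, so div[class='new-center']>h3>a matches <a> tags that are direct children of an <h3> inside a <div class="new-center">. A small self-contained check; the HTML snippet is made up for illustration:

from bs4 import BeautifulSoup

snippet = """
<div class='new-center'>
  <h3><a href='1.htm'>First headline</a></h3>
  <h3><a href='2.htm'>Second headline</a></h3>
</div>
"""
bs = BeautifulSoup(snippet, "html.parser")
for a in bs.select("div[class='new-center']>h3>a"):
    print(a['href'], a.get_text())  # 1.htm First headline / 2.htm Second headline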
import urllib.request, urllib.error, urllib.parse


# This class fetches the page data
class GetHtml:
    def __init__(self):
        print("Constructing GetHtml")

    def gethtml(self, url):
        # Send the request and return the page source
        html = ""  # fetched page source
        URL = urllib.parse.quote(url, safe=':/.')  # percent-encode the target url
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                                 "Chrome/87.0.4280.88 Safari/537.36"}  # request headers, so the crawler is not blocked outright
        req = urllib.request.Request(URL, headers=headers, method="GET")  # build the request object
        try:
            response = urllib.request.urlopen(req)
            html = response.read().decode()
        except urllib.error.HTTPError as e:
            print("Request failed:", e)
        return html
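Note that HTTPError only covers HTTP status errors; real timeouts surface as URLError (or socket.timeout), and only if a timeout is actually passed to urlopen. A hedged variant of the fetch, with a hypothetical fetch() name, that enforces one:

import socket
import urllib.request, urllib.error

def fetch(url, timeout=10):
    # Sketch: same request shape as gethtml, but with an explicit timeout
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"}, method="GET")
    try:
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return response.read().decode()
    except urllib.error.HTTPError as e:
        print("HTTP error:", e.code)
    except (urllib.error.URLError, socket.timeout) as e:
        print("Timed out or unreachable:", e)
    return ""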
Below is the Tsinghua University news crawler, saved as a .docx file.
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib.request, urllib.error, urllib.parse


# Fetch a page
def request(url):
    html = ""  # fetched page source
    URL = urllib.parse.quote(url, safe=':/.')  # percent-encode the target url
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}  # request headers, so the crawler is not blocked outright
    req = urllib.request.Request(URL, headers=headers, method="GET")  # build the request object
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode()
    except urllib.error.HTTPError as e:
        print("Request failed:", e)
    return html
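urllib.parse.quote(url, safe=':/.') percent-encodes everything outside the listed safe characters, which is what lets URLs containing Chinese characters pass through urlopen. A quick illustration with a made-up URL:

from urllib.parse import quote

url = "https://www.tsinghua.edu.cn/news/新闻.htm"  # hypothetical URL with non-ASCII characters
print(quote(url, safe=':/.'))
# https://www.tsinghua.edu.cn/news/%E6%96%B0%E9%97%BB.htm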
# Fetch and save the data
def getData(urllist, urleds):
    # Only the difference between urllist and urleds still needs crawling
    newurl = list(set(urllist) - set(urleds))
    # Exit condition: urllist holds no new url
    if len(newurl) == 0:
        printok()
        return
    data = []  # html source of each page
    for url in newurl:
        data.append(request(url))  # pages crawled earlier get smaller indices
        urleds.append(url)         # after crawling, record the address as crawled
    # Process the data list, picking the wanted fields out of each page
    for dataone in data:
        bs = BeautifulSoup(dataone, "html.parser")
        data_title_name = bs.select("p[class='bt']")  # article titles
        title_url = bs.select("div[class='news_months']>ul>li>a")
        data_title_detailtime_day = bs.select("div[class='sj']>p")       # publish day
        data_title_detailtime_mouth = bs.select("div[class='sj']>span")  # publish month
        file = open("output.html", "a+", encoding="utf-8")
        for i in range(0, len(data_title_name)):
            print("Publish time:", data_title_detailtime_mouth[i].get_text() + "."
                  + data_title_detailtime_day[i].get_text(), end="\t")
            print("Title:", data_title_name[i].get_text(), end="\n")
            file.write("<p>Title: " + data_title_name[i].get_text() + "\t"
                       + "Publish time: " + data_title_detailtime_mouth[i].get_text() + "."
                       + data_title_detailtime_day[i].get_text() + "\n</p>")
        file.close()
        # Collect the urls of the other pages
        nexturl = []
        tem = bs.select("span[class='p_no']>a")  # pagination links
        print(tem)
        for url in tem:
            a = url['href']
            if "../" in a:
                a = str(a)[3:]   # strip the leading "../"
            elif "rcpy" in a:
                a = a[5:]        # strip the leading "rcpy/"
            if a == "rcpy.htm":
                print("https://www.tsinghua.edu.cn/news/" + a)
                nexturl.append("https://www.tsinghua.edu.cn/news/" + a)
            else:
                nexturl.append("https://www.tsinghua.edu.cn/news/rcpy/" + a)
                print("https://www.tsinghua.edu.cn/news/rcpy/" + a)
        # Finally, recurse on the newly found pages
        getData(nexturl, urleds)
def printok():
    # Print a big "ok" banner when the crawl is finished
    print("--------------------------------------------------------------")
    print("##############################################################")
    print("                                                              ")
    print("        $$$$$$$$$$        ###        ###                      ")
    print("       $$$$$$$$$$$$       ###      ###                        ")
    print("      $############$      ###   ####                          ")
    print("      $$$$$$$$$$$$$$      ########                            ")
    print("      $$$$$$$$$$$$$$      ###   ###                           ")
    print("       $$$$$$$$$$$$       ###     ###                         ")
    print("        $$$$$$$$          ###       ###                       ")
    print("                                                              ")
    print("##############################################################")
    print("______________________________________________________________")


if __name__ == '__main__':
    # Entry address
    urls = ['https://www.tsinghua.edu.cn/news/rcpy.htm']
    # Stores addresses already crawled, to avoid re-crawling
    urleds = []
    # Start crawling
    getData(urls, urleds)
Blog crawler
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib.request, urllib.error, urllib.parse
from docx import Document
from docx.shared import Inches


# Fetch a page
def request(url):
    html = ""  # fetched page source
    URL = urllib.parse.quote(url, safe=':/.')  # percent-encode the target url
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"}  # request headers, so the crawler is not blocked outright
    req = urllib.request.Request(URL, headers=headers, method="GET")  # build the request object
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode()
    except urllib.error.HTTPError as e:
        print("Request failed:", e)
    return html
# Fetch and save the data
def getData(urllist, urleds):
    # Only the difference between urllist and urleds still needs crawling
    newurl = list(set(urllist) - set(urleds))
    # Exit condition: urllist holds no new url
    if len(newurl) == 0:
        printok()
        return
    data = []  # html source of each page
    for url in newurl:
        data.append(request(url))  # pages crawled earlier get smaller indices
        urleds.append(url)         # after crawling, record the address as crawled
    # Process the data list, picking the wanted fields out of each page
    for dataone in data:
        bs = BeautifulSoup(dataone, "html.parser")
        # data_year = bs.select("div[class='collection-title'] > h1[class=archive-year]")
        urls = []  # real addresses of the articles on this page
        data_title_url = bs.select("article > header > h2 > a")  # links to the articles
        for url in data_title_url:
            urls.append("https://www.kingname.info" + url['href'])
        data_title_name = bs.select("article > header > h2 > a > span")    # article titles
        data_title_detailtime = bs.select("article > header > div >time")  # publish times
        for i in range(0, len(data_title_name)):
            # Follow the article link and crawl the article itself
            title_html = request(urls[i])
            s = BeautifulSoup(title_html, "html.parser")
            imgs = s.findAll("img")                       # images in the article
            nav = s.select("div[class='post-body'] > p")  # body paragraphs
            # Write the title, time, paragraphs and images into a Word document
            save_to_doc(data_title_name[i].get_text(), data_title_detailtime[i].get_text(), nav, imgs)
        # Collect the urls of the other pages
        nexturl = []
        tem = bs.select("a[class='page-number']")  # pagination links
        for url in tem:
            nexturl.append("https://www.kingname.info" + url['href'])
        # Finally, recurse on the newly found pages
        getData(nexturl, urleds)
def save_to_doc(title, time, plist, imgs):
    doc = Document()
    save_title_name(doc, title, time)
    print(title)
    # Drop the last img on the page: it is not part of the article body
    imgs.remove(imgs[len(imgs) - 1])
    src = []
    for img in imgs:
        src.append(img['src'])
    print(len(src))
    flag = 0  # index of the next image to insert
    # Save the body: paragraphs and images, in document order
    for n in plist:
        if str(n).find("img") == -1:
            # an ordinary paragraph
            doc.add_paragraph(n.get_text())
            print(n.get_text())
        else:
            # a paragraph containing an image
            if flag < len(src):
                requestimg(doc, src[flag])
                flag = flag + 1
    doc.save(title[1:-3] + ".docx")
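python-docx builds the document in memory and only writes to disk on save; headings, paragraphs and pictures are appended in call order, which is why save_to_doc can interleave text and images. A minimal standalone example; the file names are made up:

from docx import Document
from docx.shared import Inches

doc = Document()
doc.add_heading("Example article")       # top-level heading for the title
doc.add_heading("2021-02-18", level=1)   # sub-heading used here for the date
doc.add_paragraph("First paragraph of the article.")
# doc.add_picture("local_image.jpg", width=Inches(5))  # hypothetical local image file
doc.save("example.docx")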
def save_title_name(doc, title, time):
    doc.add_heading(title)
    print(title)
    doc.add_heading(time, level=1)
    print(time)
def requestimg(doc, url):
    print(url)
    URL = urllib.parse.quote(url, safe=':/.')
    print(URL)
    # Derive a local file name from the tail of the url: take a shorter
    # slice when the tail carries no image extension
    if URL[-5:].find("jpg") == -1 and URL[-5:].find("png") == -1 and URL[-5:].find("JPEG") == -1:
        name = URL[-13:]
    else:
        name = URL[-23:]
    if name.find("/") != -1:
        sub = name.index("/")
        name = name[sub + 1:]  # keep only what follows the slash
    print(name)
    try:
        urllib.request.urlretrieve(URL, name)
    except Exception as e:
        print("Image download failed")
    try:
        doc.add_picture(name, width=Inches(5))
    except FileNotFoundError as e:
        print("Image file not found")
    print("ok")
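Slicing a fixed number of characters off the end of the URL is fragile; a hedged alternative (the image_name helper is hypothetical) takes the last path segment with the standard library instead:

import os
import urllib.parse

def image_name(url):
    # Sketch: derive a local file name from the url's last path segment
    path = urllib.parse.urlparse(url).path     # e.g. "/images/photo.png"
    return os.path.basename(path) or "image"   # fall back if the path ends in "/"

print(image_name("https://www.kingname.info/images/photo.png"))  # photo.png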
def test():
    html = request("https://www.kingname.info/2021/02/18/entry-file/")
    bs = BeautifulSoup(html, "html.parser")
    print("<p><img src='xxx'>".find("img"))
    imgs = bs.findAll("img")
    imgs.remove(imgs[len(imgs) - 1])
    for img in imgs:
        print(img['src'])
def printok():
    # Print a big "ok" banner when the crawl is finished
    print("--------------------------------------------------------------")
    print("##############################################################")
    print("                                                              ")
    print("        $$$$$$$$$$        ###        ###                      ")
    print("       $$$$$$$$$$$$       ###      ###                        ")
    print("      $############$      ###   ####                          ")
    print("      $$$$$$$$$$$$$$      ########                            ")
    print("      $$$$$$$$$$$$$$      ###   ###                           ")
    print("       $$$$$$$$$$$$       ###     ###                         ")
    print("        $$$$$$$$          ###       ###                       ")
    print("                                                              ")
    print("##############################################################")
    print("______________________________________________________________")


if __name__ == '__main__':
    # test()
    # Entry address
    urls = ['https://www.kingname.info/archives/']
    # Stores addresses already crawled, to avoid re-crawling
    urleds = []
    # Start crawling
    getData(urls, urleds)