Python course project: a simple web crawler

import GetHtml as g, SaveData as s

if __name__ == '__main__':
    # entry address
    address = ['http://news.zzu.edu.cn/mtzd.htm']
    # records addresses that have already been crawled, so they are not crawled twice
    bin = []  # list of URLs that have already been visited

    while len(address) != 0:
        get = g.GetHtml()
        htmls = []
        for url in address:
            bin.append(url)
            htmls.append(get.gethtml(url))

        for html in htmls:
            save = s.SaveData(html)
            save.save()
            address.extend(save.getOtherUrl())

        address = list(set(address) - set(bin))
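The loop above is a breadth-first crawl: address is the frontier of pages still to fetch, bin records what has already been fetched, and set(address) - set(bin) keeps only unseen links, so the loop stops once no new URLs turn up. As a point of comparison, here is a minimal hedged sketch of the same frontier/visited pattern written with sets directly (the names fetch and extract_links are placeholders, not functions from this project):

def crawl(start_url, fetch, extract_links):
    # fetch(url) returns page source, extract_links(html) returns the hrefs found in it
    frontier = {start_url}   # URLs still to visit
    visited = set()          # URLs already fetched
    while frontier:
        url = frontier.pop()
        visited.add(url)
        html = fetch(url)
        # keep only links that have not been seen yet
        frontier |= set(extract_links(html)) - visited
    return visited

The SaveData class below does the parsing and saving for each fetched page.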
from bs4 import BeautifulSoup

# This class parses the page source and saves the extracted data
class SaveData:
    html = ""  # page source

    def __init__(self, html):  # constructor; takes the html source as a string
        print("constructing SaveData")
        self.html = html

    def save(self):
        bs = BeautifulSoup(self.html, "html.parser")  # build the BeautifulSoup object
        data_title_name = bs.select("div[class='new-center']>h3>a")  # article titles
        data_title_detailtime = bs.select("div[class='new-date']")  # publish dates
        with open("output.html", "a+", encoding="utf-8") as file:
            for i in range(0, len(data_title_name)):
                print("publish time:", data_title_detailtime[i].get_text(), end="\t")
                print("title:", data_title_name[i].get_text(), end="\n")
                file.write("<p>title: " + data_title_name[i].get_text() + "    " + "publish time: " + data_title_detailtime[i].get_text() + "\n</p>")
    # collect links to the other listing pages that still need to be crawled
    def getOtherUrl(self):
        bs = BeautifulSoup(self.html, "html.parser")  # build the BeautifulSoup object
        nexturl = []
        tem = bs.select("span[class='p_no']>a")  # pagination links
        print(tem)
        for url in tem:
            a = url['href']
            # hrefs come back as relative paths such as '../mtzd.htm' or 'mtzd/2.htm';
            # strip the leading '../' or 'mtzd/' so the absolute URL can be rebuilt below
            if "../" in a:
                a = str(a)[3:]
            elif "mtzd" in a:
                a = a[5:]
            if a == "mtzd.htm":
                print("http://news.zzu.edu.cn/" + a)
                nexturl.append("http://news.zzu.edu.cn/" + a)
            else:
                nexturl.append("http://news.zzu.edu.cn/mtzd/" + a)
                print("http://news.zzu.edu.cn/mtzd/" + a)

        return nexturl
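The hand-written stripping of '../' and 'mtzd/' above only works for this one site layout. A more general way to resolve relative hrefs is urllib.parse.urljoin, which joins each link against the URL of the page it was found on. A small sketch of that alternative (the page URL would have to be passed in, which the original class does not do):

from urllib.parse import urljoin

def resolve_links(page_url, hrefs):
    # urljoin handles '../', bare file names and absolute URLs uniformly, e.g.
    # urljoin('http://news.zzu.edu.cn/mtzd/2.htm', '../mtzd.htm') -> 'http://news.zzu.edu.cn/mtzd.htm'
    return [urljoin(page_url, href) for href in hrefs]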




import urllib.request, urllib.response, urllib.error, urllib.parse

# This class fetches the page data
class GetHtml:

    def __init__(self):
        print("constructing GetHtml")

    def gethtml(self, url):  # send the HTTP request
        html = ""  # page source that gets fetched
        URL = urllib.parse.quote(url, safe=':/.')  # percent-encode the URL so non-ASCII characters are legal
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/87.0.4280.88 Safari/537.36"

        }  # request headers, so the request is not rejected as an obvious bot
        request = urllib.request.Request(URL, headers=headers, method="GET")  # build the request object
        try:
            response = urllib.request.urlopen(request)
            html = response.read().decode()
        except urllib.error.HTTPError as e:
            print("request failed:", e)
        return html
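Note that urllib.error.HTTPError is raised for HTTP error statuses (4xx/5xx), not for timeouts; to actually enforce a timeout, urlopen needs its timeout argument, and the failure then surfaces as urllib.error.URLError. A small hedged sketch of a fetch helper handling both cases (the helper name and the 10-second default are illustrative, not part of the original project):

import urllib.request, urllib.error

def fetch(url, timeout=10):
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode()
    except urllib.error.HTTPError as e:    # the server answered, but with an error status
        print("HTTP error:", e.code, url)
    except urllib.error.URLError as e:     # DNS failure, refused connection, or timeout
        print("request failed:", e.reason, url)
    return ""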

Below is a crawler for the Tsinghua University news pages; it writes the article titles and publish dates to output.html.

# -*- coding:utf-8 -*-

from bs4 import BeautifulSoup
import urllib.request, urllib.response, urllib.error, urllib.parse


# fetch one page
def request(url):
    html = ""  # page source that gets fetched
    URL = urllib.parse.quote(url, safe=':/.')  # percent-encode the URL so non-ASCII characters are legal
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"

    }  # request headers, so the request is not rejected as an obvious bot
    request = urllib.request.Request(URL, headers=headers, method="GET")  # build the request object
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode()
    except urllib.error.HTTPError as e:
        print("request failed:", e)
    return html


# extract the data from the fetched pages
def getData(urllist, urleds):
    # only the difference between urllist and urleds still needs to be crawled
    newurl = list(set(urllist) - set(urleds))
    # exit condition: urllist contains no new url
    if len(newurl) == 0:
        printok()
        return
    data = []  # html source of every page
    for url in newurl:
        data.append(request(url))  # pages fetched earlier end up at smaller indices
        # record the address as visited once it has been fetched
        urleds.append(url)
    # process the data list
    for dataone in data:
        # pull the wanted fields out with CSS selectors
        bs = BeautifulSoup(dataone, "html.parser")
        data_title_name = bs.select("p[class='bt']")  # article titles

        title_url = bs.select("div[class='news_months']>ul>li>a")

        data_title_detailtime_day = bs.select("div[class='sj']>p")  # publish day
        data_title_detailtime_month = bs.select("div[class='sj']>span")  # publish month
        with open("output.html", "a+", encoding="utf-8") as file:
            for i in range(0, len(data_title_name)):
                print("publish time:", data_title_detailtime_month[i].get_text() + "." + data_title_detailtime_day[i].get_text(), end="\t")
                print("title:", data_title_name[i].get_text(), end="\n")
                file.write("<p>title: " + data_title_name[i].get_text() + "\t" + "publish time: " + data_title_detailtime_month[i].get_text() + "." + data_title_detailtime_day[i].get_text() + "\n</p>")
        # 1: collect the URLs of the other listing pages
        nexturl = []
        tem = bs.select("span[class='p_no']>a")  # pagination links
        print(tem)
        for url in tem:
            a = url['href']
            # hrefs come back as relative paths such as '../rcpy.htm' or 'rcpy/2.htm';
            # strip the leading '../' or 'rcpy/' so the absolute URL can be rebuilt below
            if "../" in a:
                a = str(a)[3:]
            elif "rcpy" in a:
                a = a[5:]
            if a == "rcpy.htm":
                print("https://www.tsinghua.edu.cn/news/" + a)
                nexturl.append("https://www.tsinghua.edu.cn/news/" + a)
            else:
                nexturl.append("https://www.tsinghua.edu.cn/news/rcpy/" + a)
                print("https://www.tsinghua.edu.cn/news/rcpy/" + a)


    # recurse on the newly found URLs
    getData(nexturl, urleds)


def printok():
    print("--------------------------------------------------------------", end="\n")
    print("##############################################################", end="\n")
    print("                                                              ", end="\n")
    print("           $$$$$$$$$$      ###     ###                        ", end="\n")
    print("          $$$$$$$$$$$$     ###    ###                         ", end="\n")
    print("         $############$    ###  ####                          ", end="\n")
    print("         $$$$$$$$$$$$$$    ########                           ", end="\n")
    print("         $$$$$$$$$$$$$$    ###  ###                           ", end="\n")
    print("          $$$$$$$$$$$$     ###    ###                         ", end="\n")
    print("            $$$$$$$$       ###     ###                        ", end="\n")
    print("                                                              ", end="\n")
    print("##############################################################", end="\n")
    print("______________________________________________________________", end="\n")


if __name__ == '__main__':
    # entry address
    urls = ['https://www.tsinghua.edu.cn/news/rcpy.htm']
    # records addresses that have already been crawled, so they are not crawled twice
    urleds = []  # list of URLs that have already been visited
    # start crawling
    getData(urls, urleds)

Blog crawler: crawls the article archive of www.kingname.info and saves each article, including its images, to a .docx file.

# -*- coding:utf-8 -*-

from bs4 import BeautifulSoup
import urllib.request, urllib.response, urllib.error, urllib.parse
from docx import Document
from docx.shared import Inches
import re



# fetch one page
def request(url):
    html = ""  # page source that gets fetched
    URL = urllib.parse.quote(url, safe=':/.')  # percent-encode the URL so non-ASCII characters are legal
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/89.0.4389.114 Safari/537.36"
    }  # request headers, so the request is not rejected as an obvious bot
    request = urllib.request.Request(URL, headers=headers, method="GET")  # build the request object
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode()
    except urllib.error.HTTPError as e:
        print("request failed:", e)
    return html


# extract the data from the fetched pages
def getData(urllist, urleds):
    # only the difference between urllist and urleds still needs to be crawled
    newurl = list(set(urllist) - set(urleds))
    # exit condition: urllist contains no new url
    if len(newurl) == 0:
        printok()
        return
    data = []  # html source of every page
    urls = []  # the real article URLs
    for url in newurl:
        data.append(request(url))  # pages fetched earlier end up at smaller indices
        # record the address as visited once it has been fetched
        urleds.append(url)
    # process the data list
    for dataone in data:
        # pull the wanted fields out with CSS selectors
        bs = BeautifulSoup(dataone, "html.parser")
        # data_year = bs.select("div[class='collection-title'] > h1[class=archive-year]")
        data_title_url = bs.select("article > header > h2 > a")  # links to the individual articles

        # 1: build the absolute article URLs
        for url in data_title_url:
            urls.append("https://www.kingname.info" + url['href'])
        data_title_name = bs.select("article > header > h2 > a > span")  # article titles
        data_title_detailtime = bs.select("article > header > div >time")  # publish dates
        for i in range(0, len(data_title_name)):
            # write the title and date of each article to a Word document
            # print(data_title_detailtime[i].get_text(), end="\t")
            # print(data_title_name[i].get_text(), end="\n")

            # fetch the article page itself and parse its content
            title_html = request(urls[i])
            s = BeautifulSoup(title_html, "html.parser")
            imgs = s.findAll("img")
            # paragraphs of the article body
            nav = s.select("div[class='post-body'] > p")
            save_to_doc(data_title_name[i].get_text(), data_title_detailtime[i].get_text(), nav, imgs)
            # for n in nav:
            #     print(n.get_text())
        # 1: collect the URLs of the other archive pages
        nexturl = []
        tem = bs.select("a[class='page-number']")  # pagination links
        for url in tem:
            nexturl.append("https://www.kingname.info" + url['href'])

    # recurse on the newly found URLs
    getData(nexturl, urleds)

def save_to_doc(title, time, plist, imgs):
    doc = Document()
    save_title_name(doc, title, time)
    print(title)
    imgs.pop()  # drop the last <img>, which does not belong to the article body
    src = []
    for img in imgs:
        src.append(img['src'])
    print(len(src))
    flag = 0  # index of the next image to insert
    # save the article body
    for n in plist:
        if str(n).find("img") == -1:
            # plain text paragraph
            doc.add_paragraph(n.get_text())
            print(n.get_text())
        else:
            # this paragraph contains an image
            if flag < len(src):
                requestimg(doc, src[flag])
                flag = flag + 1

    doc.save(title[1:-3] + ".docx")  # the title is sliced before being used as the file name
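# The slice title[1:-3] is a crude trim and will still break if the title contains
# characters such as '/' or '?'. A hedged alternative sketch using the re module that is
# already imported (the helper name and the '_' replacement character are illustrative):

import re

def safe_filename(title):
    # replace characters that are not allowed in file names on Windows/Linux
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
    return cleaned + ".docx"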

def save_title_name(doc,title,time):
    doc.add_heading(title)
    print(title)
    doc.add_heading(time, level=1)
    print(time)



def requestimg(doc, url):
    print(url)
    URL = urllib.parse.quote(url, safe=':/.')
    print(URL)

    # derive a local file name from the tail of the URL
    if URL[-5:].find("jpg") == -1 and URL[-5:].find("png") == -1 and URL[-5:].find("JPEG") == -1:
        name = URL[-13:]
    else:
        name = URL[-23:]
    if name.find("/") != -1:
        sub = name.index("/")
        name = name[sub + 1:]  # keep only the part after the first '/'
    print(name)
    try:
        urllib.request.urlretrieve(URL, name)  # download the image to the local file
    except Exception as e:
        print("image download failed")
    try:
        doc.add_picture(name, width=Inches(5))  # embed the image into the document
    except FileNotFoundError as e:
        print("image file not found")
    print("ok")

def test():
    html = request("https://www.kingname.info/2021/02/18/entry-file/")
    bs = BeautifulSoup(html, "html.parser")
    print("<p><img src='xxx'>".find("img"))
    imgs = bs.findAll("img")
    imgs.remove(imgs[len(imgs)-1])
    for img in imgs:
        print(img['src'])



def printok():
    print("--------------------------------------------------------------", end="\n")
    print("##############################################################", end="\n")
    print("                                                              ", end="\n")
    print("           $$$$$$$$$$      ###     ###                        ", end="\n")
    print("          $$$$$$$$$$$$     ###    ###                         ", end="\n")
    print("         $############$    ###  ####                          ", end="\n")
    print("         $$$$$$$$$$$$$$    ########                           ", end="\n")
    print("         $$$$$$$$$$$$$$    ###  ###                           ", end="\n")
    print("          $$$$$$$$$$$$     ###    ###                         ", end="\n")
    print("            $$$$$$$$       ###     ###                        ", end="\n")
    print("                                                              ", end="\n")
    print("##############################################################", end="\n")
    print("______________________________________________________________", end="\n")
if __name__ == '__main__':
    # test()
    # entry address
    urls = ['https://www.kingname.info/archives/']
    # records addresses that have already been crawled, so they are not crawled twice
    urleds = []
    # start crawling
    getData(urls, urleds)






