python爬虫自学3-结果整合加打印-2022-02-10_python爬虫输出整合-CSDN博客

本文链接：https://blog.csdn.net/zhaozhi0810/article/details/122862546

这段代码演示了如何使用Python的requests和BeautifulSoup库从网页抓取大学排名信息，并进行解析。信息包括排名、学校名称、英文名、网址、所属地区等，然后将数据打印并保存到Excel表格中。代码首先尝试读取已下载的网页内容，如果内容不足则重新下载。抓取成功后，数据被组织成列表结构，并写入Excel文件。

摘要由CSDN通过智能技术生成

单个的结果都可以解析出来了，那就把数据整合一下，然后打印出来呗。

看来还是用列表吧。

每个学校一共有以下几项：

1.排名，2.学校中文名，3.学校英文名，4.学校详情网址，5.校标（logo）网址，6.所属地区，7.类型，8.总分，9.办学层次（默认是这个选项）。

每个学校一个列表，所有的学校再组合成一个大列表。

随便改了一下：仅供小白参考吧。

import requests
from bs4 import BeautifulSoup
import re
import bs4

def getHtmlText(url):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or the sentinel string "网页打开失败" when the
        request fails or the server answers with an HTTP error status.
    """
    try:
        # A timeout keeps the script from hanging forever on a stalled server.
        r = requests.get(url, timeout=30)
        r.raise_for_status()   # turn 4xx/5xx responses into an exception
        # Decode with the encoding guessed from the body (apparent_encoding)
        # instead of whatever the response headers implied.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures map to the sentinel; a bare ``except``
        # would also hide programming errors such as a misspelled name.
        return "网页打开失败"


def getInfoFromHtml(uLIst,html):
    """Parse the ranking page and append one record per university to ``uLIst``.

    Records are built column by column from separate passes over the
    document: the digits of each ``div.ranking``, the text of each
    ``a.name-cn``, the text and ``href`` of each ``a.name-en``, the ``src``
    of each ``img.univ-logo``, and finally one value for every ``<td>`` that
    follows a ``td.align-left`` cell in the same row.

    Args:
        uLIst: List that receives the records; it is extended in place and
            records already present are left untouched.
        html: Markup of the ranking page.

    Returns:
        The number of ``td.align-left`` cells found, which the caller uses
        as the row count.

    Raises:
        IndexError: If a later pass finds more elements than there are
            ``div.ranking`` entries.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Rows created by this call start here, so a list that already holds
    # records gets new rows instead of extra columns on its existing ones.
    base = len(uLIst)

    for div in soup.find_all("div", class_="ranking", string=True):
        found = re.search(r"\d+", div.string)
        # Keep the row even when the text has no digits, so the index-based
        # passes below stay aligned with it.
        uLIst.append([found.group() if found else div.string.strip()])

    for idx, link in enumerate(soup.find_all("a", class_="name-cn", string=True), start=base):
        uLIst[idx].append(link.string)

    for idx, link in enumerate(soup.find_all("a", class_="name-en", string=True), start=base):
        uLIst[idx].append(link.string)
        uLIst[idx].append(link["href"])

    for idx, logo in enumerate(soup.find_all("img", alt=True, class_="univ-logo"), start=base):
        uLIst[idx].append(logo["src"])

    name_cells = soup.find_all("td", class_="align-left")
    for idx, cell in enumerate(name_cells, start=base):
        for sibling in cell.find_next_siblings("td"):
            found = re.search(r"(\w.?)+", sibling.text)
            # An empty cell still contributes a column; otherwise the
            # remaining values of this row would shift left.
            uLIst[idx].append(found.group() if found else "")
    return len(name_cells)


def printList(List,num=20):
    """Print a header line followed by at most ``num`` records.

    Args:
        List: Sequence of records; each record must provide at least nine
            fields, printed in the column order of the header.
        num: Maximum number of records to print. Values larger than
            ``len(List)`` print every record; zero or negative values print
            only the header.

    Raises:
        IndexError: If a printed record has fewer than nine fields.
    """
    # One shared template keeps the header and the data rows laid out alike.
    row_format = "{:^10}\t{:^10}\t{:^10}{:^10}\t{:^10}\t{:^10}{:^10}\t{:^10}\t{:^10}"
    print(row_format.format("排名","学校名称","学校英文名称","详细网址","校标网址","所属地区","类型","综合总分","办学层次"))
    # Slicing clamps the count to the list length, so the default of 20 no
    # longer raises IndexError on a shorter list. max() keeps a negative
    # count from being read as "all but the last rows".
    for itm in List[:max(num, 0)]:
        print(row_format.format(*itm[:9]))


if __name__ == '__main__':
    # Script entry point: reuse the locally cached copy of the ranking page
    # when it is long enough, otherwise download it and refresh the cache,
    # then parse the page and print the table.
    cache_path = "url_save"
    min_html_len = 1000   # shorter content is treated as a failed or partial page
    infoList = []

    try:
        # The context manager closes the handle even if the read fails.
        with open(cache_path, mode="r", encoding="utf-8") as fp:
            html = fp.read()
    except (OSError, UnicodeDecodeError):
        html = ""   # missing or unreadable cache: fall through to the download

    if len(html) < min_html_len:
        print("文件打开失败，下载网页")
        url = "https://www.shanghairanking.cn/rankings/bcur/2021"
        html = getHtmlText(url)
        # Store what was fetched so the next run can skip the download.
        with open(cache_path, mode="w", encoding="utf-8") as fp:
            fp.write(html)

    if len(html) > min_html_len:
        num = getInfoFromHtml(infoList, html)
        printList(infoList,num)
    else:
        print("文件内容小于1000个字节，不做解析处理")

打印结果：

粗略一看，好像没啥问题

写入到excel表格中存起来吧。

又去抄了一段代码：

import requests
from bs4 import BeautifulSoup
import re
# import bs4
import xlsxwriter as xw

def getHtmlText(url):
    """Fetch the page at ``url`` and return its text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or the sentinel string "网页打开失败" when the
        request fails or the server answers with an HTTP error status.
    """
    try:
        # Without a timeout a stalled server would block the script forever.
        r = requests.get(url, timeout=30)
        r.raise_for_status()   # turn 4xx/5xx responses into an exception
        # Decode with the encoding guessed from the body (apparent_encoding)
        # instead of whatever the response headers implied.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Catch only request failures; a bare ``except`` would also swallow
        # unrelated errors such as a misspelled name.
        return "网页打开失败"


def getInfoFromHtml(uLIst,html):
    """Parse the ranking page and append one record per university to ``uLIst``.

    Records are built column by column from separate passes over the
    document: the digits of each ``div.ranking``, the text of each
    ``a.name-cn``, the text and ``href`` of each ``a.name-en``, the ``src``
    of each ``img.univ-logo``, and finally one value for every ``<td>`` that
    follows a ``td.align-left`` cell in the same row.

    Args:
        uLIst: List that receives the records; it is extended in place and
            records already present are left untouched.
        html: Markup of the ranking page.

    Returns:
        The number of ``td.align-left`` cells found, which the caller uses
        as the row count.

    Raises:
        IndexError: If a later pass finds more elements than there are
            ``div.ranking`` entries.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Rows created by this call start here, so a list that already holds
    # records gets new rows instead of extra columns on its existing ones.
    first_new = len(uLIst)

    for rank_div in soup.find_all("div", class_="ranking", string=True):
        digits = re.search(r"\d+", rank_div.string)
        # Keep the row even when the text has no digits, so the index-based
        # passes below stay aligned with it.
        uLIst.append([digits.group() if digits else rank_div.string.strip()])

    for row, anchor in enumerate(soup.find_all("a", class_="name-cn", string=True), start=first_new):
        uLIst[row].append(anchor.string)

    for row, anchor in enumerate(soup.find_all("a", class_="name-en", string=True), start=first_new):
        uLIst[row].append(anchor.string)
        uLIst[row].append(anchor["href"])

    for row, image in enumerate(soup.find_all("img", alt=True, class_="univ-logo"), start=first_new):
        uLIst[row].append(image["src"])

    name_cells = soup.find_all("td", class_="align-left")
    for row, name_cell in enumerate(name_cells, start=first_new):
        for data_cell in name_cell.find_next_siblings("td"):
            value = re.search(r"(\w.?)+", data_cell.text)
            # An empty cell still contributes a column; otherwise the
            # remaining values of this row would shift left.
            uLIst[row].append(value.group() if value else "")
    return len(name_cells)


def printList(List,num=20):
    """Print the table header and then up to ``num`` records.

    Args:
        List: Sequence of records; each record must provide at least nine
            fields, printed in the column order of the header.
        num: Maximum number of records to print. Values larger than
            ``len(List)`` print every record; zero or negative values print
            only the header.

    Raises:
        IndexError: If a printed record has fewer than nine fields.
    """
    # The header and every data row go through the same template.
    template = "{:^10}\t{:^10}\t{:^10}{:^10}\t{:^10}\t{:^10}{:^10}\t{:^10}\t{:^10}"
    print(template.format("排名","学校名称","学校英文名称","详细网址","校标网址","所属地区","类型","综合总分","办学层次"))
    # The slice stops at the end of the list, so asking for more rows than
    # exist (for example via the default of 20) no longer raises IndexError.
    # max() prevents a negative count from selecting rows from the end.
    for record in List[:max(num, 0)]:
        print(template.format(*record[:9]))

def saveListToExcel(LIst, fileName):
    """Write the records to a new Excel workbook with a header row.

    Args:
        LIst: Iterable of records; each record is written as one row
            starting in column A, beginning at row 2.
        fileName: Path of the ``.xlsx`` file to create.
    """
    title = ["排名","学校名称","学校英文名称","详细网址","校标网址","所属地区","类型","综合总分","办学层次"]  # header row
    # The context manager closes the workbook (which is what writes the file)
    # even if one of the row writes raises part-way through.
    with xw.Workbook(fileName) as workbook:
        worksheet1 = workbook.add_worksheet("sheet1")
        worksheet1.activate()
        worksheet1.write_row(0, 0, title)  # zero-based (row, col): cell A1
        # Zero-based row 1 is cell A2, directly below the header.
        for row_idx, record in enumerate(LIst, start=1):
            worksheet1.write_row(row_idx, 0, record)



if __name__ == '__main__':
    # Script entry point: reuse the locally cached copy of the ranking page
    # when it is long enough, otherwise download it and refresh the cache,
    # then parse the page, print the table and save it to an Excel file.
    cache_path = "url_save"
    min_html_len = 1000   # shorter content is treated as a failed or partial page
    infoList = []

    try:
        # The context manager closes the handle even if the read fails.
        with open(cache_path, mode="r", encoding="utf-8") as fp:
            html = fp.read()
    except (OSError, UnicodeDecodeError):
        html = ""   # missing or unreadable cache: fall through to the download

    if len(html) < min_html_len:
        print("文件打开失败，下载网页")
        url = "https://www.shanghairanking.cn/rankings/bcur/2021"
        html = getHtmlText(url)
        # Store what was fetched so the next run can skip the download.
        with open(cache_path, mode="w", encoding="utf-8") as fp:
            fp.write(html)

    if len(html) > min_html_len:
        num = getInfoFromHtml(infoList, html)
        printList(infoList,num)
        saveListToExcel(infoList, "最好大学.xlsx")
    else:
        print("文件内容小于1000个字节，不做解析处理")

结果如下：