一个爬虫我认为是万能的代码模板

最新推荐文章于 2024-07-04 11:25:23 发布

爱雨天

最新推荐文章于 2024-07-04 11:25:23 发布

阅读量690

点赞数 1

文章标签：正则表达式 python

本文链接：https://blog.csdn.net/weixin_52634719/article/details/120091792

版权





from bs4 import BeautifulSoup     #网页解析，获取数据
import re       #正则表达式，进行文字匹配
import urllib.request,urllib.error      #制定URL，获取网页数据
import xlwt     #进行excel操作
import sqlite3  #进行SQLite数据库操作






def main():
    """Download a single listing page and print its raw HTML."""
    target = "https://www.fosu.edu.cn/eie/category/yjsgz/dsfc/page/2"
    # NOTE: follow-up steps (collecting detail-page URLs, parsing each one and
    # saving the results to a database) were only sketched and are not run.
    print(askURL(target))

    # 得到指定一个URL的网页内容
def askURL(url, encoding="utf-8"):
    """Fetch *url* and return the response body decoded as text.

    Args:
        url: Address of the page to download.
        encoding: Character encoding used to decode the body. Defaults to
            "utf-8"; pass e.g. "gbk" for pages served in that encoding.

    Returns:
        The decoded page content, or "" when the request fails with a
        urllib.error.URLError (its status code and/or reason are printed).
    """
    # Send a browser-like User-Agent header so the request looks like it
    # comes from a regular desktop browser.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }

    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode raises.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode(encoding)
    except urllib.error.URLError as e:
        # HTTPError carries a status code; every URLError carries a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

    return html


# Run the scraper only when this file is executed directly as a script.
if __name__ == "__main__":
    main()

这张图是我爬取电信学院研究生导师信息时学以致用的证明

想爬什么只需修改一下网址就行，还要注意网页是不是 utf-8 编码，不是的话就改为 gbk

以下是我同学做的，他还导出了一个 Excel 表格，把爬到的信息和网址都存到里面去了

# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup   # 网页解析，获取数据
import re    # 正则表达式，进行文字匹配
import urllib.request,urllib.error  # 制定URL，获取网页数据
import xlwt  # 进行excel操作
import sqlite3  # 进行SQLilte数据库操作

# Module-level list meant to hold the hyperlinks of all detail pages.
chao = []

# Regular expressions applied to the HTML of each listing entry.
# Example of a matching anchor:
#   <a href="https://www.fosu.edu.cn/eie/yjsgz/dsfc/7463.html" title="齐浩亮">
findLink = re.compile(r'<a href="(.*?)" title=".*?">')  # captures the href value
findImg = re.compile(r'<img.*?src="(.*?)".*?/>')  # captures the src value of an <img ... /> tag
findName = re.compile(r'<a href=".*title="(.*?)">')  # captures the title value


def main():
    """Scrape the advisor listing and export the records to an .xls file."""
    listing_url = "https://www.fosu.edu.cn/eie/category/yjsgz/dsfc"
    output_file = "电信学院研究生导师.xls"
    saveData(getData(listing_url), output_file)

def getData(baseurl):
    """Scrape the listing pages under *baseurl* and collect one record per entry.

    The first page is served at ``baseurl`` itself and the second one at
    ``baseurl + "/page/2"``. Every ``<div class="pullimg">`` element on those
    pages yields one record.

    Args:
        baseurl: URL of the first listing page, without a trailing slash.

    Returns:
        A list of records. Each record is ``[link, img, name]`` where every
        field is the list of matches returned by ``re.findall`` for the
        corresponding module-level pattern.

    Side effects:
        Appends every detail-page link to the module-level ``chao`` list and
        prints the number of collected records.
    """
    datalist = []
    # Page suffixes spelled out explicitly: "" is page 1, "/page/2" is page 2.
    for suffix in ("", "/page/2"):
        html = askURL(baseurl + suffix)

        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="pullimg"):
            # The patterns work on text, so serialise the tag back to HTML.
            item = str(item)
            link = re.findall(findLink, item)
            img = re.findall(findImg, item)
            name = re.findall(findName, item)

            # Mutate the module-level list in place; a plain assignment here
            # would only bind a local name and leave the global list empty.
            chao.extend(link)

            datalist.append([link, img, name])

    number = len(datalist)  # number of advisors found
    print("导师个数", number)
    return datalist


# 得到一个url的内容
def askURL(url, encoding="utf-8"):
    """Download *url* and return its body decoded as text.

    Args:
        url: Address of the page to fetch.
        encoding: Character encoding used to decode the body. Defaults to
            "utf-8"; pass e.g. "gbk" for pages served in that encoding.

    Returns:
        The decoded page content, or "" when the request fails with a
        urllib.error.URLError (its status code and/or reason are printed).
    """
    # Browser-like User-Agent header sent with the request.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
    }

    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # Using the response as a context manager guarantees it is closed.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode(encoding)
    except urllib.error.URLError as e:
        # HTTPError carries a status code; every URLError carries a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html





def saveData(datalist, savepath):
    """Write the scraped records to an .xls workbook at *savepath*.

    Args:
        datalist: Sequence of records; the first three fields of each record
            are written, in order, to the three columns of one row. With the
            records built by ``getData`` those fields are the detail-page
            link, the photo URL and the advisor name.
        savepath: Path of the workbook file to create.
    """
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('电信学院研究生导师', cell_overwrite_ok=True)

    # Header titles follow the field order of the records (link, photo, name)
    # so that every column is labelled with what it actually contains.
    col = ("导师介绍详情页", "导师照片", "导师姓名")
    for j, title in enumerate(col):
        sheet.write(0, j, title)

    for i, data in enumerate(datalist):
        print("第%d条" % i)
        # Row 0 holds the header, so record i goes to row i + 1.
        for j in range(len(col)):
            sheet.write(i + 1, j, data[j])

    # Honour the caller-supplied path instead of a hard-coded file name.
    book.save(savepath)

# Run the scraper only when this file is executed directly as a script.
if __name__ == '__main__':
    main()

以下是导出的 Excel 表格

爱雨天

关注

1
点赞
踩
9

收藏

觉得还不错? 一键收藏
0
评论
一个爬虫我认为是万能的代码模板

这图是我爬取电信学院研究生导师时的学以致用的证明from bs4 import BeautifulSoup #网页解析，获取数据import re #正则表达式，进行文字匹配import urllib.request,urllib.error #制定URL，获取网页数据import xlwt #进行excel操作import sqlite3 #进行SQLite数据库操作def main(): url = "https://www..
复制链接

扫一扫