以爬取山东工商学院的新闻为例
正则表达式
自己是个非计算机专业的外行,写下来保存以便于以后复习
from urllib.request import urlopen
from urllib.parse import urljoin
from re import findall, sub, S
from os.path import basename, isdir
from os import mkdir
# Crawl SDTBU news articles starting from `url`, following each page's
# "下一条" (next article) link until the newest article (which has none).
# For every article: create a folder named after the <h1> title, download
# the embedded images into it, and save the paragraph text as <title>.txt.
dstDir = r'D:\山东工商学院新闻'
if not isdir(dstDir):
    mkdir(dstDir)

url = r'http://www.sdtbu.edu.cn/info/1043/24108.htm'
while True:
    with urlopen(url) as fp:
        content = fp.read().decode()

    # The article title is the page's only <h1>.
    # NOTE(review): findall(...)[0] raises IndexError if the page layout
    # changes — assumes every article page has an <h1>.
    pattern = r'<h1 .*?>(.*?)</h1>'
    title = findall(pattern, content)[0]
    articleDir = dstDir + '\\' + title  # one folder per article
    if not isdir(articleDir):
        mkdir(articleDir)

    # Download every image referenced by the article body.
    pattern = r'<img width=.*?src="(.+?)"'
    for picUrl in findall(pattern, content):
        picUrl = urljoin(url, picUrl)  # resolve relative image paths
        print(picUrl)
        with urlopen(picUrl) as fpUrl:
            with open(articleDir + '\\' + basename(picUrl), 'wb') as fp:
                fp.write(fpUrl.read())

    # Collect paragraph text; DOTALL so paragraphs spanning lines match.
    pattern = '<p.*?>(.*?)</p>'
    result = findall(pattern, content, S)
    print(result)
    # Explicit encoding so output does not depend on the Windows default
    # codepage (the platform default could raise UnicodeEncodeError).
    with open(articleDir + '\\' + title + '.txt', 'w', encoding='utf-8') as fp:
        for para in result:
            # Strip residual tags, non-breaking spaces and 【...】 markers.
            para = sub(r'<.*?>| |【.*?】', '', para).strip()
            if para != '' and not para.startswith(('上一条', '下一条')):
                fp.write(para + '\n')

    # Follow the "next article" link; the newest article has none,
    # which is the normal termination condition.
    pattern = r'下一条:<a href="(.*?)"'
    try:
        nextUrl = findall(pattern, content)[0]
    except IndexError:  # no "下一条" link — reached the newest article
        break
    url = urljoin(url, nextUrl)
这个方法比较麻烦,需要逐个分析页面结构并编写正则表达式,但这是爬虫的基础。
下面改用 Beautiful Soup 实现同样的功能:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Same article fetched with requests + BeautifulSoup: the parser replaces
# the hand-written regular expressions of the version above.
url = r'http://www.sdtbu.edu.cn/info/1043/24108.htm'
content = requests.get(url)
# The page is UTF-8; set it explicitly in case requests mis-guesses
# the encoding from the response headers.
content.encoding = 'utf8'
# Parse the HTML once and reuse the tree for both queries
# (the original parsed the same document twice).
page = BeautifulSoup(content.text, 'lxml')
title = page.find('h1')
print(title.text)
# assumes the article body lives in <div id="vsb_content"> — TODO confirm
soup = page.find('div', id="vsb_content")
with open(r'C:\Users\Administrator\Desktop\test.txt', 'w', encoding='utf-8') as fp:
    fp.write(soup.text)