以爬取山东工商学院的新闻为例
正则表达式
自己是个非计算机专业的外行,写下来保存以便于以后复习
from urllib.request import urlopen
from urllib.parse import urljoin
from re import findall, sub, S
from os.path import basename, isdir
from os import mkdir
# Crawl SDTBU news articles starting from `url`, following each page's
# "下一条" (next article) link until the newest article (which has none).
# For every article: create a folder named after the <h1> title, download
# the embedded images into it, and save the paragraph text as <title>.txt.
dstDir = r'D:\山东工商学院新闻'
if not isdir(dstDir):
    mkdir(dstDir)

url = r'http://www.sdtbu.edu.cn/info/1043/24108.htm'
while True:
    with urlopen(url) as fp:
        content = fp.read().decode()

    # The article title is the page's only <h1>.
    # NOTE(review): findall(...)[0] raises IndexError if the page layout
    # changes — assumes every article page has an <h1>.
    pattern = r'<h1 .*?>(.*?)</h1>'
    title = findall(pattern, content)[0]
    articleDir = dstDir + '\\' + title  # one folder per article
    if not isdir(articleDir):
        mkdir(articleDir)

    # Download every image referenced by the article body.
    pattern = r'<img width=.*?src="(.+?)"'
    for picUrl in findall(pattern, content):
        picUrl = urljoin(url, picUrl)  # resolve relative image paths
        print(picUrl)
        with urlopen(picUrl) as fpUrl:
            with open(articleDir + '\\' + basename(picUrl), 'wb') as fp:
                fp.write(fpUrl.read())

    # Collect paragraph text; DOTALL so paragraphs spanning lines match.
    pattern = '<p.*?>(.*?)</p>'
    result = findall(pattern, content, S)
    print(result)
    # Explicit encoding so output does not depend on the Windows default
    # codepage (the platform default could raise UnicodeEncodeError).
    with open(articleDir + '\\' + title + '.txt', 'w', encoding='utf-8') as fp:
        for para in result:
            # Strip residual tags, non-breaking spaces and 【...】 markers.
            para = sub(r'<.*?>| |【.*?】', '', para).strip()
            if para != '' and not para.startswith(('上一条', '下一条')):
                fp.write(para + '\n')

    # Follow the "next article" link; the newest article has none,
    # which is the normal termination condition.
    pattern = r'下一条:<a href="(.*?)"'
    try:
        nextUrl = findall(pattern, content)[0]
    except IndexError:  # no "下一条" link — reached the newest article
        break
    url = urljoin(url, nextUrl)
这个方法比较麻烦,需要逐个分析页面结构并编写正则表达式,但这是爬虫的基础。
下面改用 Beautiful Soup 实现同样的功能:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Same article fetched with requests + BeautifulSoup: the parser replaces
# the hand-written regular expressions of the version above.
url = r'http://www.sdtbu.edu.cn/info/1043/24108.htm'
content = requests.get(url)
# The page is UTF-8; set it explicitly in case requests mis-guesses
# the encoding from the response headers.
content.encoding = 'utf8'
# Parse the HTML once and reuse the tree for both queries
# (the original parsed the same document twice).
page = BeautifulSoup(content.text, 'lxml')
title = page.find('h1')
print(title.text)
# assumes the article body lives in <div id="vsb_content"> — TODO confirm
soup = page.find('div', id="vsb_content")
with open(r'C:\Users\Administrator\Desktop\test.txt', 'w', encoding='utf-8') as fp:
    fp.write(soup.text)