# Example: scraping Sina News.
import re
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
def getSoup(newsurl, timeout=10):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        newsurl: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``. Defaults to 10.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            the timeout expires.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive
    # server, which would stall the whole crawl.
    res = requests.get(newsurl, timeout=timeout)
    # Decode the body as UTF-8 explicitly rather than using whatever
    # encoding requests derived from the response.
    res.encoding = 'utf-8'
    return BeautifulSoup(res.text, 'html.parser')
# newsurl is the link to one news article taken from the Sina News "sh" home page.
# Print soup to inspect the structure of the page.
# Take the text of the first element matched by the 'title' selector.
# NOTE(review): `soup` is not assigned anywhere in the visible code --
# presumably it is the result of getSoup(newsurl); confirm before running.
# Indexing [0] raises IndexError when the selector matches nothing.
title=soup.select('title')[0].text
def getArtcle(soup):
article=[]
for p in soup.select('#article p')[:-1]:
article.append(p.text.strip())
return '