# News text classification: first scrape some Sohu news pages to build a dataset.
#_*_coding:utf-8_*_
import requests
from bs4 import BeautifulSoup
import os
def geturl(URL):
    """Collect article URLs from one Sohu domestic-news index page.

    Parameters
    ----------
    URL : int
        Page number substituted into the index-page URL pattern.

    Returns
    -------
    list[str]
        href of the second ``<a>`` inside each ``div.article`` block;
        blocks without such a link are skipped.
    """
    baseurl = 'http://news.sohu.com/guoneixinwen_%d.shtml' % URL
    # Timeout so one stalled connection cannot hang the whole crawl.
    request = requests.get(url=baseurl, timeout=10)
    respons = request.content  # raw page source
    soup = BeautifulSoup(respons, 'html.parser')  # parse the source
    urllist = []
    for page in soup.select('div.article'):
        links = page.select('a')
        # Guard: the original indexed links[1] unconditionally, which raises
        # IndexError/KeyError when a block has fewer than two anchors or the
        # anchor has no href. Skip malformed blocks instead of crashing.
        if len(links) > 1 and 'href' in links[1].attrs:
            urllist.append(links[1].attrs['href'])
    return urllist
def get_xinwen(URL):
    """Download the article body text for every link on one index page.

    Parameters
    ----------
    URL : int
        Index-page number, forwarded to ``geturl``.

    Returns
    -------
    list[str]
        Plain text of ``div#contentText`` for each article that has one;
        articles without that container, or that fail to download, are
        skipped rather than aborting the whole page.
    """
    contentlist = []
    for url in geturl(URL):
        try:
            # Timeout so a single dead link cannot hang the crawl.
            request = requests.get(url=url, timeout=10)
        except requests.RequestException:
            # One failed download should not abort the remaining articles.
            continue
        respons = request.content  # raw page source
        soup = BeautifulSoup(respons, 'html.parser')  # parse the source
        # The original wrapped this lookup in try/except IndexError; an
        # explicit emptiness check is narrower and equivalent.
        nodes = soup.select('div#contentText')
        if nodes:
            contentlist.append(nodes[0].text)
    return contentlist
if __name__=="__main__":
for num in range(12081,12181):
a=get_xinwen(num)
lenth=len(a)
os.mkdir('./%d'%num)
for i in range(lenth):
f=open('./%d/%d.txt'%(num,i),'w',encoding='utf-8')
f.write(a[i]) #写入txt