记录一下自然语言处理课程的作业。
一、爬取数据集
1.爬取差别比较明显的五个类别
import math
import os
import urllib.request
import re
from bs4 import BeautifulSoup
# Landing pages of the five news categories to crawl on 163.com.
# Only one category is crawled per run; it is picked by its index in this list.
urllist = [
    'https://sports.163.com/',       # 0: sports
    'https://ent.163.com/',          # 1: entertainment
    'https://edu.163.com/',          # 2: education
    'https://tech.163.com/',         # 3: technology
    'https://money.163.com/stock/',  # 4: stocks
]
def get_urls(category=4):
    """Collect the article links shown on one category landing page.

    Downloads ``urllist[category]``, parses it with BeautifulSoup and keeps
    the ``href`` of every ``<a>`` tag whose link matches the
    ``https://www.163.com/dy/article/`` pattern.

    Args:
        category: Index into the module-level ``urllist`` selecting the page
            to crawl. Defaults to 4, the index that used to be hard-coded
            here, so existing ``get_urls()`` calls behave as before.

    Returns:
        A list of article URLs with duplicates removed, kept in the order
        in which they first appear on the page.

    Raises:
        IndexError: If ``category`` is outside the range of ``urllist``.
        urllib.error.URLError: If the page cannot be fetched.
    """
    url = urllist[category]
    # Use the response as a context manager so the connection is released
    # as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all(name="a", attrs={
        "href": re.compile("https://www.163.com/dy/article/")})
    # dict.fromkeys drops duplicates while keeping first-seen order in a
    # single pass (the set + sort-by-index approach was quadratic).
    return list(dict.fromkeys(link.get("href") for link in links))
def get_article(urls):
count = 0
for url in urls:
count = count + 1
html = urllib.request.urlopen(url).read().decode("utf-8")
soup = BeautifulSoup(html,'html.parser')
links = soup.find_all(name="p", attrs={
"id": re.compile("^0")