详情见我的github:https://github.com/Snowing-ST/Statistical-Case-Studies/tree/master/Lab3%20English%20Text%20Processing
1. 爬取新华网 Business - Finance 类别的新闻 URL
由于新华网是动态网站,新闻列表由 JavaScript 异步加载,不能直接从页面源码中爬取;因此先通过浏览器开发者工具的“检查”功能,找到实际返回新闻摘要列表的接口网址,该接口返回的信息均以 JSON 格式保存。
import requests
import json
from selenium import webdriver
from lxml import etree
import time
import os
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer #词形变化
from sklearn.feature_extraction.text import CountVectorizer
# Request headers sent with the list request; only a browser-like User-Agent
# is supplied.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.933.400 QQBrowser/9.4.8699.400',
}
# Endpoint that serves the news-summary list. The query string asks for
# pgnum=1 and cnt=50 and carries a jQuery-style callback name.
url = 'http://qc.wa.news.cn/nodeart/list?nid=11143416&pgnum=1&cnt=50&tp=1&orderby=1?callback=jQuery111307913713753952403_1522322711817&_=1522322711818'
# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() surfaces HTTP errors before any parsing is attempted.
data = requests.get(url, headers=headers, timeout=10)
data.raise_for_status()
html = data.content
# Extract the JSON text. The body is presumably wrapped as `callback(<json>)`
# followed by a short suffix (the original code dropped a fixed 42-byte prefix
# and 2-byte suffix). Locating the outermost parentheses gives the same slice
# without depending on the exact length of the callback name.
_json_start = html.find(b"(") + 1
_json_end = html.rfind(b")")
if _json_start == 0 or _json_end < _json_start:
    raise ValueError("unexpected response: no callback(...) wrapper found")
dic_html = html[_json_start:_json_end]
# Number of news entries in the returned list (the original author noted 50).
url_num = len(json.loads(dic_html)["data"]["list"])
def geturls(i):
return