# -*- coding: utf-8 -*-
"""Crawl Sina rolling-news list pages and build a word-segmented corpus.

For each list page the script collects the article URLs, downloads every
article, splits its body text on punctuation, segments each fragment with
jieba and appends the result to a text file, one fragment per line.
"""
__author__ = 'Administrator'

import json
import re
import string
import urllib.parse
import urllib.request
from urllib.error import HTTPError, URLError

import jieba
from bs4 import BeautifulSoup


def get_page(num):
    """Build the Sina rolling-news list API URL for one result page.

    Args:
        num: Page number to request. It is converted with ``str`` before
            being substituted into the ``page=`` query parameter.

    Returns:
        The complete request URL as a string. The query asks for
        ``format=json`` wrapped in the ``newsloadercallback`` callback.
    """
    return (
        "http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1"
        "||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&"
        "format=json&page={}&callback=newsloadercallback"
    ).format(str(num))


# Number of characters stripped from the start and end of the response body
# to reach the JSON document.
# NOTE(review): these fixed offsets presumably correspond to the
# "newsloadercallback(" wrapper plus surrounding characters and a trailing
# ");" - confirm against a live response before changing them.
_JSONP_PREFIX_LEN = 21
_JSONP_SUFFIX_LEN = 2


def _extract_article_urls(jsdata):
    """Return the ``url`` field of every entry in one list-page response."""
    data = jsdata[_JSONP_PREFIX_LEN:-_JSONP_SUFFIX_LEN]
    # Rewrite single quotes to double quotes and drop every "\u" sequence
    # before parsing (presumably so that json.loads accepts the payload;
    # the URLs themselves are plain ASCII).
    data = data.replace("'", '"')
    data = data.replace("\\u", "")
    jsondata = json.loads(data)
    return [entry["url"] for entry in jsondata["result"]["data"]]


def get_url(page_url):
    """Fetch one list page and return the article URLs it references.

    Args:
        page_url: URL of the list endpoint to request.

    Returns:
        A list of URL strings. The list is empty when the request fails,
        when the response status is not 200, or when the response body
        cannot be parsed into the expected structure.
    """
    # Percent-encode non-ASCII characters (e.g. Chinese) in the request URL;
    # every printable ASCII character is left untouched.
    page_url = urllib.parse.quote(page_url, safe=string.printable)
    try:
        res = urllib.request.urlopen(page_url)
    except HTTPError as e:
        print("The server couldn't fulfill the request.")
        print('Error code:', e.code)
        return []
    except URLError as e:
        print('We failed to reach a server.')
        print('Reason:', e.reason)
        return []

    # Release the connection as soon as the body has been read.
    with res:
        if res.getcode() != 200:
            return []
        jsdata = res.read().decode("utf-8")

    try:
        return _extract_article_urls(jsdata)
    except (ValueError, KeyError, TypeError) as e:
        # One malformed list page must not abort a multi-page crawl.
        print('Could not parse the list response:', e)
        return []


# Delimiters on which article text is cut into fragments. Non-ASCII entries
# are written as escapes so they survive character normalisation.
# NOTE(review): the full-width forms marked "restored" are an assumption: the
# original pattern listed ",", "?", "!", ":", ";", "(" and ")" twice, which
# suggests one copy of each was a full-width character - confirm.
_SENTENCE_DELIMITERS = (
    "\u3002",        # ideographic full stop
    "\u201c",        # left double quotation mark
    "\u201d",        # right double quotation mark
    "\u300a",        # left double angle bracket
    "\u300b",        # right double angle bracket
    "\u3001",        # ideographic comma
    "\u00b7",        # middle dot
    "\u2014\u2014",  # double em dash
    "\u2018",        # left single quotation mark
    "\u2019",        # right single quotation mark
    "\uff0c",        # full-width comma (restored)
    "\uff1f",        # full-width question mark (restored)
    "\uff01",        # full-width exclamation mark (restored)
    "\uff1a",        # full-width colon (restored)
    "\uff1b",        # full-width semicolon (restored)
    "\uff08",        # full-width left parenthesis (restored)
    "\uff09",        # full-width right parenthesis (restored)
    "\u3000",        # ideographic space (restored)
    " ", ",", "?", ".", "!", ":", ";", "`", "~", "@", "#", "$", "%", "^",
    "&", "*", "(", ")", "-", "_", "+", "=", "[", "]", "{", "}", '"', "'",
    "||",
)

# Every delimiter is escaped, and there is no empty alternative: an empty
# alternative makes re.split cut between every character on Python 3.7+.
_SENTENCE_SPLIT_RE = re.compile(
    "|".join(re.escape(delim) for delim in _SENTENCE_DELIMITERS)
)


def get_context(new_url, output_path="traindata.txt"):
    """Download one article and append its segmented text to a file.

    The text of the ``<p>`` elements under ``#artibody`` (all but the last
    one) is concatenated, split on punctuation, and each non-empty fragment
    is segmented with ``jieba.cut``. Fragments are appended to
    ``output_path`` as space-separated tokens, one fragment per line.

    Args:
        new_url: URL of the article page.
        output_path: File the fragments are appended to (UTF-8). Defaults to
            ``traindata.txt`` in the current working directory.

    Returns:
        None. Nothing is written when the request fails or the response
        status is not 200.
    """
    # Percent-encode non-ASCII characters (e.g. Chinese) in the request URL;
    # every printable ASCII character is left untouched.
    httpurl = urllib.parse.quote(new_url, safe=string.printable)
    try:
        html = urllib.request.urlopen(httpurl)
    except HTTPError as e:
        print("The server couldn't fulfill the request.")
        print('Error code:', e.code)
        return
    except URLError as e:
        print('We failed to reach a server.')
        print('Reason:', e.reason)
        return

    # Release the connection as soon as the body has been read.
    with html:
        if html.getcode() != 200:
            return
        res = html.read().decode("utf-8")

    soup = BeautifulSoup(res, 'html.parser')
    # The final paragraph is deliberately skipped
    # (presumably a footer / editor credit - verify).
    paragraphs = soup.select('#artibody p')[:-1]
    context = ''.join(p.text.strip() for p in paragraphs)

    fragments = [frag for frag in _SENTENCE_SPLIT_RE.split(context) if frag != ""]
    with open(output_path, 'a', encoding='utf-8') as file:
        file.writelines(" ".join(jieba.cut(frag)) + '\n' for frag in fragments)


if __name__ == "__main__":
    # Walk list pages 1..1000 and process every article each page links to.
    for i in range(1, 1001):
        print("第 %d 页" % i)
        page_url = get_page(i)
        # get_url returns plain URL strings (an empty list on failure), so
        # each one is passed straight to get_context.
        for url in get_url(page_url):
            get_context(url)