Python crawler: scraping Sina news content (from the current time back to an earlier point in time), segmenting it with jieba, to build training data for your own word-segmentation model...

# -*- coding:utf-8 -*-
__author__ = 'Administrator'

import re
from bs4 import BeautifulSoup
import urllib.request
import jieba
import string
import urllib.parse
from urllib.error import HTTPError, URLError
import json

def get_page(num):
    # Build the roll-news API URL for the given page number; the response is
    # JSONP wrapped in a newsloadercallback(...) call.
    return ("http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1"
            "||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&"
            "format=json&page={}&callback=newsloadercallback").format(str(num))

def get_url(page_url):
    # Percent-encode any Chinese or other special characters in the request URL.
    page_url = urllib.parse.quote(page_url, safe=string.printable)
    # print(page_url)
    url_list = []
    try:
        res = urllib.request.urlopen(page_url)
    except HTTPError as e:
        print('The server couldn\'t fulfill the request.')
        print('Error code:', e.code)
        return url_list
    except URLError as e:
        print('We failed to reach a server.')
        print('Reason:', e.reason)
        return url_list
    else:
        if res.getcode() == 200:
            jsdata = res.read().decode("utf-8")

            # Method 1 for extracting the article URLs: a regex over the raw response.
            # result = re.findall('"url":"http.*?\.s?html"', jsdata)  # adding ? after .* makes it non-greedy
            # for url in result:
            #     url = url.split(":", maxsplit=1)[1]
            #     url = url.replace('\\', "")
            #     url_list.append(url)

            # Method 2 for extracting the article URLs: strip the JSONP wrapper and parse the JSON.
            data = jsdata[21:-2]
            data = re.sub('\'', '\"', data)
            data = re.sub(r"\\u", "", data)
            jsondata = json.loads(data)
            for dat in jsondata["result"]["data"]:
                url_list.append(dat["url"])
    return url_list
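The data = jsdata[21:-2] slice in method 2 assumes the JSONP wrapper around the payload always has the same length. A more defensive variant, sketched below and not part of the original script (the helper name strip_jsonp is mine), takes whatever sits between the outermost parentheses of the newsloadercallback(...) call and then applies the same quote and \u clean-up; whether json.loads accepts the real payload without further fixes is an assumption:

def strip_jsonp(jsdata):
    # Grab everything between the first '(' and the last ')' of the callback,
    # instead of relying on a fixed [21:-2] slice.
    match = re.search(r"\((.*)\)", jsdata, re.S)
    if match is None:
        return None
    data = match.group(1)
    # Same clean-up as in get_url: the API may use single quotes and \u escapes.
    data = re.sub("'", '"', data)
    data = re.sub(r"\\u", "", data)
    return json.loads(data)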


def get_context(new_url):
    # Percent-encode any Chinese or other special characters in the request URL.
    httpurl = urllib.parse.quote(new_url, safe=string.printable)
    # print(httpurl)
    # print(type(httpurl))
    try:
        html = urllib.request.urlopen(httpurl)
    except HTTPError as e:
        print('The server couldn\'t fulfill the request.')
        print('Error code:', e.code)
    except URLError as e:
        print('We failed to reach a server.')
        print('Reason:', e.reason)
    else:
        if html.getcode() == 200:
            res = html.read().decode("utf-8")
            # print(res)
            soup = BeautifulSoup(res, 'html.parser')
            # print(soup.prettify)
            # Grab the article body from #artibody, dropping the last <p>.
            result = {}
            result["article"] = ''.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])
            context = result['article']
            # Split the article on Chinese and ASCII punctuation.
            pattern = r',|。|“|”|?|!|:|《|》|、|;|·|——| |‘|’|,|\?|\.|\!|`|~|\@|\#|\$|%|\^|\&|\*|(|)|\(|\)|-|\_|\+|=|\[|\]|\{|\}|"|\'|\|\|'
            li = re.split(pattern, context)
            # print("li")
            # Append each non-empty fragment to traindata.txt as space-separated jieba tokens.
            with open(r".\traindata.txt", 'a', encoding='utf-8') as file:
                for l in li:
                    if l != "":
                        sentence = " ".join(jieba.cut(l))
                        file.write(sentence + '\n')

if __name__ == "__main__":
    for i in range(1, 1001):
        print("第 %d 页" % i)  # "Page %d"
        page_url = get_page(i)
        url_list = get_url(page_url)
        # print(url_list)  # ['"http://news.sina.com.cn/c/nd/2017-06-11/doc-ifyfzhac1171724.shtml"', ...],
        #                  # the double-quoted URLs still carry an outer layer of single quotes
        if url_list:
            for url in url_list:
                # print(eval(url))
                # print(type(url))
                # get_context(eval(url))  # for URLs extracted with method 1
                get_context(url)  # for URLs extracted with method 2
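Once the crawl finishes, traindata.txt holds the pre-segmented corpus, one fragment per line. A small sketch of loading it back as token lists for whatever segmentation model you train next; the helper name load_traindata is mine, and the path assumes the same location the script writes to:

def load_traindata(path=r".\traindata.txt"):
    # Each line is one fragment, already split into space-separated jieba tokens.
    with open(path, encoding="utf-8") as f:
        return [line.split() for line in f if line.strip()]

# sentences = load_traindata()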
