网易云课堂 python网络爬虫实战

最新推荐文章于 2024-08-28 10:44:00 发布
豆乳_艾米
最新推荐文章于 2024-08-28 10:44:00 发布
阅读量2.1k
点赞数
文章标签：网络爬虫 python
本文链接：https://blog.csdn.net/yunini2/article/details/74415294
版权
import requests
# Download the Sina China-news index page and inspect the response object.
newsurl = 'http://news.sina.com.cn/china/'
# A timeout keeps the script from blocking forever on an unresponsive server.
res = requests.get(newsurl, timeout=10)
# The original notes report that this page was detected as ISO-8859-1, which
# garbles the Chinese text; overriding the encoding makes res.text decode as UTF-8.
res.encoding = 'utf-8'
print(res)  # <Response [200]>
print(res.text)  # page HTML, decoded with the encoding set above
print(type(res))  # <class 'requests.models.Response'>
print(res.encoding)  # utf-8 (the value assigned above)

from bs4 import BeautifulSoup
# Minimal single-line HTML document used by the parsing examples: one <h1>
# with id "title" and two <a> tags that share the class "link".
html_sample = (
    '<html>'
    '<body>'
    '<h1 id="title">Hello World</h1>'
    '<a href="#" class="link">This is link1</a>'
    '<a href="#link2"class="link">This is link2</a>'
    '</body>'
    '</html>'
)
# Parse the sample document. The parser is named explicitly so the result does
# not depend on which optional parser libraries happen to be installed.
soup = BeautifulSoup(html_sample, 'html.parser')
print(type(soup))  # <class 'bs4.BeautifulSoup'>
print(soup.text)  # Hello WorldThis is link1This is link2
# Selecting specific tags and nodes.
# select('h1') returns a list of every <h1> element.
soup = BeautifulSoup(html_sample, 'html.parser')
header = soup.select('h1')
print(header)  # [<h1 id="title">Hello World</h1>]
print(header[0])  # <h1 id="title">Hello World</h1>
print(header[0].text)  # Hello World
# Indexing with [0] yields the element itself instead of the enclosing list.
# select('a') returns a list of every <a> element.
alink = soup.select('a')
print(alink)
# [<a class="link" href="#">This is link1</a>, <a class="link" href="#link2">This is link2</a>]
for link in alink:
    print(link)
#==============================================================================
# <a class="link" href="#">This is link1</a>
# <a class="link" href="#link2">This is link2</a>
#==============================================================================
for link in alink:
    print(link.text)
#==============================================================================
# This is link1
# This is link2
#==============================================================================
#==============================================================================
# Selecting elements by CSS attributes:
# - select every element with id "title" (prefix the id with '#')
# - select every element with class "link" (prefix the class with '.')
#==============================================================================
alink = soup.select('#title')
print(alink)  # [<h1 id="title">Hello World</h1>]
soup = BeautifulSoup(html_sample, 'html.parser')
for link in soup.select('.link'):
    print(link)
#==============================================================================
# <a class="link" href="#">This is link1</a>
# <a class="link" href="#link2">This is link2</a>
#==============================================================================
# Find the href of every <a> tag (an href is what links to other pages).
# Tag attributes are exposed like a dictionary, so they are read with [].
alinks = soup.select('a')
for link in alinks:
    print(link['href'])
#==============================================================================
# #
# #link2
#==============================================================================
# Any attribute on a tag, including non-standard ones, can be read by name.
a = '<a href="#" qao=123 abc=456> i am a link</a>'
soup2 = BeautifulSoup(a, 'html.parser')
first_link = soup2.select('a')[0]  # look the element up once and reuse it
print(first_link)  # the <a> element with its abc, href and qao attributes
print(first_link['abc'])  # 456
print(first_link['qao'])  # 123
print(first_link['href'])  # #
print(soup2.text)  #  i am a link
#爬取news.sina.com.cn/china/，根据不同html标签取得对应内容
from bs4 import BeautifulSoup
import requests
# Fetch and parse the China-news index page.
res = requests.get('http://news.sina.com.cn/china/', timeout=10)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
# Print each element carrying the "news-item" class (one per news entry).
for news in soup.select('.news-item'):
    print(news)
#==============================================================================
# <div class="news-item first-news-item ">
# <h2><a href="http://news.sina.com.cn/c/nd/2017-07-05/doc-ifyhrxsk1800038.shtml" suda-uatrack="key=newschina_index_2014&value=news_link_1" target="_blank">上海一些订餐平台助推私宴牟利 相关部门介入</a></h2>
# <div class="info clearfix ">
# <div class="time">7月5日 09:20</div>
# <div class="action"><a data-id="gn:comos-fyhrxsk1800038:0" href="http://comment5.news.sina.com.cn/comment/skin/default.html?channel=gn&newsid=comos-fyhrxsk1800038&style=0" target="_blank">评论</a><span class="spliter">|</span><span class="bdshare_t bds_tools get-codes-bdshare" data="{text:'上海一些订餐平台助推私宴牟利 相关部门介入',url:'http://news.sina.com.cn/c/nd/2017-07-05/doc-ifyhrxsk1800038.shtml',pic:''}" id="bdshare"><span class="bds_more">分享</span></span></div>
# </div>
# </div>
#==============================================================================
# Print the list of <h2> elements found inside each news item; items without
# an <h2> print an empty list.
for news in soup.select('.news-item'):
    print(news.select('h2'))
#==============================================================================
# [<h2><a href="http://news.sina.com.cn/c/nd/2017-07-05/doc-ifyhrxsk1800038.shtml" suda-uatrack="key=newschina_index_2014&value=news_link_1" target="_blank">上海一些订餐平台助推私宴牟利 相关部门介入</a></h2>]
# [<h2><a href="http://news.sina.com.cn/c/nd/2017-07-05/doc-ifyhryex6189357.shtml" suda-uatrack="key=newschina_index_2014&value=news_link_2" target="_blank">四川暴雨来袭 成都打响211座城市桥梁保卫战</a></h2>]
# []
# []
# ...
# [<h2><a href="http://news.sina.com.cn/c/sd/2016-11-27/doc-ifxyawxa2866597.shtml" target="_blank">湖南从严推进县乡人大换届选举:铭记衡阳案教训</a></h2>]
# [<h2><a href="http://news.sina.com.cn/c/sd/2016-11-21/doc-ifxxwrwh4831425.shtml" target="_blank">北京国I国Ⅱ车辆明年2月15日起五环内限行</a></h2>]
#==============================================================================
# Print the headline text of every news item that has one.
for news in soup.select('.news-item'):
    headlines = news.select('h2')
    if headlines:  # some items have no <h2>; skip them
        print(headlines[0].text)
#==============================================================================
# 上海一些订餐平台助推私宴牟利 相关部门介入
# 四川暴雨来袭 成都打响211座城市桥梁保卫战
# 环保部:支持白洋淀治理规划 补齐生态环境短板
# “上海交警”APP短信挪车 暂时不能通知到外牌
# 西北政法大学校长贾宇去职 已在校生活38年
# ...
# 湖南从严推进县乡人大换届选举:铭记衡阳案教训
# 北京国I国Ⅱ车辆明年2月15日起五环内限行
#==============================================================================
# Print "<headline> <url>" for every news item that has a headline.
for news in soup.select('.news-item'):
    headlines = news.select('h2')
    if headlines:  # some items have no <h2>; skip them
        h2 = headlines[0].text
        a = news.select('a')[0]['href']  # link target of the first <a>
        # One formatted string prints the same on Python 2 and Python 3.
        print(u'%s %s' % (h2, a))
#==============================================================================
# 舰载机飞行员牺牲细节:4.4秒生死瞬间欲救战机 http://news.sina.com.cn/c/sd/2016-11-28/doc-ifxyawxa2907507.shtml
# 贵州童工多因贫困外出打工 有时连吃盐都成问题 http://news.sina.com.cn/o/2016-11-28/doc-ifxyasmv2025198.shtml
# 湖南从严推进县乡人大换届选举:铭记衡阳案教训 http://news.sina.com.cn/c/sd/2016-11-27/doc-ifxyawxa2866597.shtml
# 北京国I国Ⅱ车辆明年2月15日起五环内限行 http://news.sina.com.cn/c/sd/2016-11-21/doc-ifxxwrwh4831425.shtml
#==============================================================================
#加入时间time
# Print "<time> <headline> <url>" for every news item that has a headline.
for news in soup.select('.news-item'):
    headlines = news.select('h2')
    if headlines:  # some items have no <h2>; skip them
        h2 = headlines[0].text
        time = news.select('.time')[0].text  # text of the element with class "time"
        a = news.select('a')[0]['href']  # link target of the first <a>
        # One formatted string prints the same on Python 2 and Python 3.
        print(u'%s %s %s' % (time, h2, a))
#==============================================================================
# 11月28日 07:50 舰载机飞行员牺牲细节:4.4秒生死瞬间欲救战机 http://news.sina.com.cn/c/sd/2016-11-28/doc-ifxyawxa2907507.shtml
# 11月28日 05:41 贵州童工多因贫困外出打工 有时连吃盐都成问题 http://news.sina.com.cn/o/2016-11-28/doc-ifxyasmv2025198.shtml
# 11月27日 14:01 湖南从严推进县乡人大换届选举:铭记衡阳案教训 http://news.sina.com.cn/c/sd/2016-11-27/doc-ifxyawxa2866597.shtml
# 11月21日 15:00 北京国I国Ⅱ车辆明年2月15日起五环内限行 http://news.sina.com.cn/c/sd/2016-11-21/doc-ifxxwrwh4831425.shtml
#==============================================================================
#抓取内文页面
import requests
from bs4 import BeautifulSoup
# Fetch one article page and parse it.
res = requests.get('http://news.sina.com.cn/c/nd/2017-07-05/doc-ifyhrxsk1791835.shtml', timeout=10)
res.encoding = 'utf-8'
print(res.text)  # page HTML decoded as UTF-8
soup = BeautifulSoup(res.text, 'html.parser')
# The article title is the text of the element with id "artibodyTitle".
alink = soup.select('#artibodyTitle')[0].text
print(alink)
# Publication time and source are inside the element with class "time-source".
time = soup.select('.time-source')[0]
print(time)
#==============================================================================
# <span class="time-source" id="navtimeSource">2017年07月05日07:57           <span>
# <span data-sudaclick="media_name"><a href="http://www.thepaper.cn/newsDetail_forward_1724885" rel="nofollow" target="_blank">新浪综合</a></span></span>
# </span>
#==============================================================================
#将时间和来源分开

#2017年07月05日07:57                
#新浪综合
# The first child node of the time-source element is the timestamp text;
# strip() removes the surrounding whitespace.
timesource = soup.select('.time-source')[0].contents[0].strip()
print(timesource)
# json.dumps(..., ensure_ascii=False) renders the text as a quoted string with
# the non-ASCII characters kept readable. The `encoding` keyword was dropped:
# it only exists on Python 2 (where utf-8 is already its default) and raises
# TypeError on Python 3.
import json
print(json.dumps(timesource, ensure_ascii=False))
#==============================================================================
# 时间字符串转换,在python3.X中会默认utf-8格式，需要转换。
# 字符串转时间strptime
# from datetime import datetime
# dt = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
# dt
# 时间转字符串strftime
# dt.strftime('%Y-%m-%d')
#==============================================================================
# The source (media) name is the text of the <a> nested in a <span> inside
# the time-source element.
medianame = soup.select('.time-source span a')[0].text
print(medianame)
# No `encoding` keyword: it is Python 2-only and raises TypeError on Python 3.
print(json.dumps(medianame, ensure_ascii=False))
# Get the article body: the element with id "artibody".
# A bare expression is only echoed in an interactive session, so the result is
# printed explicitly to make it visible when the file runs as a script.
print(soup.select('#artibody'))
#==============================================================================
# [<div class="article article_16" id="artibody">
#  <p>　　来源：澎湃新闻</p>
#  <p>　　原标题：起底章莹颖案嫌犯所上网站：仍有大量绑架内容，卷入多起刑案</p>
#  <p>　　克里斯滕森的车因与最后载走章莹颖的车很相似而被调查。FBI在检查其手机时发现，他曾在4月访问过一家成人社交网站中的“新手绑架课程”论坛，浏览了“完美绑架幻想”和“计划一场绑架”等帖子。FBI在刑事起诉书中称，这对实施绑架起到了作用。 </p>
#  <p>　　澎湃新闻（www.thepaper.cn）调查发现，涉事网站此前就多次卷入刑事案件，引起了执法部门的注意，但网站上至今仍存在大量关于绑架的内容，并未受到明显影响。网站用户的国际性以及内容的“灰色地带”都给监管带来了难度。 </p>
#  
#==============================================================================
# Keep only the <p> elements inside the article body.
body_paragraphs = soup.select('#artibody p')
# Drop the last <p>; printed explicitly because a bare expression shows
# nothing when the file runs as a script.
print(body_paragraphs[:-1])
#==============================================================================
#  <p>　　据报道，法院将于本周三下午再次举行聆讯，不过克里斯滕森的律师布鲁诺表示，“面对这样的控告，总会是一场艰难的战斗。”布罗诺说，在联邦系统中，如果控告涉及暴力犯罪或者武器，被告很少会被取保候审。</p>,
#  <p>　　布鲁诺还呼吁公众以开放心态看待本案，他指出公众目前了解的信息“并不是故事的全部”，还有许多公众没有意识到的信息。</p>,
#  <p>　　来源：澎湃新闻</p>]
#==============================================================================
# Collect the text of every body paragraph except the last <p>.
article = [p.text for p in soup.select('#artibody p')[:-1]]
# The original print line could not run: a stray double quote made it a syntax
# error, and it called .encode() on the list itself, which lists do not have.
# Printing the list directly is not a fix either: a list prints the repr of
# its items, which on Python 2 shows non-ASCII text as escapes such as
# u'\u3000\u3000\u6765\u6e90\uff1a\u6f8e\u6e43\u65b0\u95fb'.
# Joining the paragraphs into one string prints readable text instead.
print(u'\n'.join(article))
# Render the paragraph list as JSON with non-ASCII characters kept readable.
# No `encoding` keyword: it is Python 2-only and raises TypeError on Python 3.
print(json.dumps(article, ensure_ascii=False))
#==============================================================================
# ["　　来源：澎湃新闻", "　　原标题：起底章莹颖案嫌犯所上网站：仍有大量绑架内容，卷入多起刑案", 
# "　　克里斯滕森的车因与最后载走章莹颖的车很相似而被调查。FBI在检查其手机时发现，他曾在4月访问过一家成人社交网站中的“新手绑架课程”论坛，浏览了“完美绑架幻想”和“计划一场绑架”等帖子。FBI在刑事起诉书中称，这对实施绑架起到了作用。 ",
#  "　...]
#==============================================================================
# Build the article text as one string: take every body paragraph except the
# last <p>, strip surrounding whitespace from each, and join with single spaces.
# The result is bound to a name and printed because bare expressions are
# discarded when the file runs as a script.
stripped_paragraphs = [p.text.strip() for p in soup.select('#artibody p')[:-1]]
content = ' '.join(stripped_paragraphs)
print(content)