在文章《python html抓取,并用re正则表达式解析(一)》中,用的是re正则表达式提取相应的内容,本次引入BeautifulSoup进行提取。
#coding=utf-8
'''
作业1:
url :"http://money.163.com/special/pinglun/"
抓取第一页的新闻信息,并按照以下规格输出。
[
{'title':'生鲜电商为何难盈利?','created_at':'2013-05-03 08:43','url':'http://money.163.com/13/0503/08/8TUHSEEI00254ITK.html'},
{'title':'生鲜电商为何难盈利?','created_at':'2013-05-03 08:43','url':'http://money.163.com/13/0503/08/8TUHSEEI00254ITK.html'}
]
使用BeautifulSoup完成作业1的要求.
'''
import urllib.request
from bs4 import BeautifulSoup as bs4

# Fetch the NetEase money commentary listing page and extract every news
# entry as {'title', 'created_at', 'url'}.
# NOTE(review): the page is decoded as GBK — will raise UnicodeDecodeError
# if the site ever changes encoding; confirm against the page's <meta charset>.
url = 'http://money.163.com/special/pinglun/'
# Use the response as a context manager so the socket is closed even on error
# (the original left it to the garbage collector).
with urllib.request.urlopen(url) as web_page:
    content = web_page.read().decode('gbk')
soup = bs4(content, 'html.parser')
# Each news entry lives inside an element with class "item_top".
basic_soup = soup.find_all(class_="item_top")
result = []
for item in basic_soup:
    # The element's visible text is "<title>\n ... \n<timestamp>"; strip each
    # piece so surrounding whitespace does not leak into the output fields.
    lines = item.text.strip().split('\n')
    title = lines[0].strip()
    created_at = lines[-1].strip()
    # The href only exists on the <a> tag, not on the container element.
    link = item.find('a').get('href')
    result.append({'title': title, 'created_at': created_at, 'url': link})
print(result)
注意点:
>>> basic_soup[0].find(href=re.compile('http'))
<a href="http://money.163.com/16/0425/14/BLGM1PH5002551G6.html">贾跃亭的成功意味着实体失败?</a>
>>> basic_soup[0].find(href=re.compile('.html'))
<a href="http://money.163.com/16/0425/14/BLGM1PH5002551G6.html">贾跃亭的成功意味着实体失败?</a>
>>> basic_soup[0].find(href=re.compile('.html'))
<a href="http://money.163.com/16/0425/14/BLGM1PH5002551G6.html">贾跃亭的成功意味着实体失败?</a>
>>> cc = basic_soup[0].find(href=re.compile('.html'))
>>> type(cc)
<class 'bs4.element.Tag'>
>>> cc.get('href') #此时可以得到href的地址
'http://money.163.com/16/0425/14/BLGM1PH5002551G6.html'
>>>
>>> type(basic_soup[0])
<class 'bs4.element.Tag'>
>>> basic_soup[0].get('href')
>>> dd = basic_soup[0].find(href=re.compile('http'))
>>> dd.get('href')
'http://money.163.com/16/0425/14/BLGM1PH5002551G6.html'
>>> ee = basic_soup[0].find('h2')
>>> ee
<h2><a href="http://money.163.com/16/0425/14/BLGM1PH5002551G6.html">贾跃亭的成功意味着实体失败?</a></h2>
>>> ee.get('href') #此时无法得到href的地址
>>> type(ee)
<class 'bs4.element.Tag'>
ff = basic_soup[0].find('a')
>>> ff
<a href="http://money.163.com/16/0425/14/BLGM1PH5002551G6.html">贾跃亭的成功意味着实体失败?</a>
>>> ff.get('href') #此时可以得到href的地址
'http://money.163.com/16/0425/14/BLGM1PH5002551G6.html'
#由此可见,如果需要获取href的地址,需要通过标签a(find('a'))来获得
>>> zz = basic_soup[0].find(class_='time')
>>> zz
<span class="time">2016-04-25 14:28:18</span>
>>> type(zz)
<class 'bs4.element.Tag'>
>>> zz.text
'2016-04-25 14:28:18'
上面可以优化如下:
#coding=utf-8
import urllib.request
from bs4 import BeautifulSoup as bs4

# Optimized variant: pull title/url/time from dedicated tags instead of
# splitting the element's raw text.
url = 'http://money.163.com/special/pinglun/'
# Close the HTTP response deterministically instead of relying on GC.
with urllib.request.urlopen(url) as web_page:
    content = web_page.read().decode('gbk')  # site is served as GBK
soup = bs4(content, 'html.parser')
basic_soup = soup.find_all(class_="item_top")
result = []
for item in basic_soup:
    # The first <a> inside the entry carries both the headline text and the
    # link, so look it up once instead of twice per iteration.
    anchor = item.find('a')
    result.append({
        'title': anchor.text,
        'created_at': item.find(class_='time').text,
        'url': anchor.get('href'),
    })
print(result)