python html抓取,并用re正则表达式解析(一)

html抓取,并用re进行解析

#coding=utf-8

import urllib.request
import re

'''
url :"http://money.163.com/special/pinglun/"
抓取第一页的新闻信息,并按照以下规格输出。

[
  {'title':'生鲜电商为何难盈利?','created_at':'2013-05-03 08:43','url':'http://money.163.com/13/0503/08/8TUHSEEI00254ITK.html'},

  {'title':'生鲜电商为何难盈利?','created_at':'2013-05-03 08:43','url':'http://money.163.com/13/0503/08/8TUHSEEI00254ITK.html'}
]
'''

url = 'http://money.163.com/special/pinglun/'

result = []

# One news entry: from its container <div> up to the first closing </span>.
_ITEM_RE = re.compile(r'<div class="list_item clearfix">.*?</span>', re.S)

# Within a single entry: group 1 = link, group 2 = headline, group 3 = timestamp.
_FIELDS_RE = re.compile(
    r'<div class="list_item clearfix">.*?<h2><a href="(.*?)">(.*?)</a></h2>'
    r'.*?<span class="time">(.*?)</span>',
    re.S,
)


def fetch_html(page_url, encoding='gbk'):
    """Download *page_url* and return its body decoded with *encoding*.

    The default is 'gbk' because the target page declares
    <meta http-equiv="Content-Type" content="text/html; charset=gbk">;
    decoding it as utf-8 raises UnicodeDecodeError (or yields garbled
    text when errors are ignored).
    """
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(page_url) as response:
        return response.read().decode(encoding)


def parse_news(html):
    """Return a list of {'title', 'created_at', 'url'} dicts found in *html*.

    Entries are first isolated with _ITEM_RE, then their fields are pulled
    out with _FIELDS_RE. An entry whose snippet does not contain both a
    headline link and a time span is skipped instead of aborting the run.
    """
    news = []
    for item in _ITEM_RE.finditer(html):
        fields = _FIELDS_RE.match(item.group())
        if fields is None:
            # Snippet ended before a headline/time pair was seen.
            continue
        news.append({
            'title': fields.group(2),
            'created_at': fields.group(3),
            'url': fields.group(1),
        })
    return news


if __name__ == '__main__':
    result.extend(parse_news(fetch_html(url)))
    print(result)

输出内容

[
{'title': '贾跃亭的成功意味着实体失败?', 'created_at': '2016-04-25 14:28:18', 'url': 'http://money.163.com/16/0425/14/BLGM1PH5002551G6.html'}, 
{'title': '海尔模式为何在西方叫好不叫座', 'created_at': '2016-04-22 15:00:23', 'url': 'http://money.163.com/16/0422/15/BL90MCB400253G87.html'}, 
{'title': '有前科就不能开网约车?', 'created_at': '2016-04-12 15:30:49', 'url': 'http://money.163.com/16/0412/15/BKFAETGB002552IJ.html'}, 
{'title': '影业公司能助网络视频抬身价吗', 'created_at': '2016-03-31 13:43:27', 'url': 'http://money.163.com/16/0331/13/BJG7HME600253G87.html'}, 
{'title': '美的收购东芝究竟值不值?', 'created_at': '2016-03-31 08:48:45', 'url': 'http://money.163.com/16/0331/08/BJFMM2AB00253G87.html'}, 
{'title': '日本家电企业真的不行了吗?', 'created_at': '2016-03-18 16:40:02', 'url': 'http://money.163.com/16/0318/16/BIF2FM7A002551G6.html'}, 
{'title': '淘宝只是中国制造乱象的镜子', 'created_at': '2016-03-16 09:56:58', 'url': 'http://money.163.com/16/0316/09/BI96K6L000253G87.html'}, 
{'title': 'iPhone 6s太失败? 苹果需创新', 'created_at': '2016-01-26 14:45:14', 'url': 'http://money.163.com/16/0126/14/BE8V83A500253G87.html'}, 
{'title': '从贴吧事件看大公司如何担责', 'created_at': '2016-01-18 16:02:05', 'url': 'http://money.163.com/16/0118/16/BDKGF2C000253G87.html'},
{'title': '销量不佳股价跌 苹果错在哪里', 'created_at': '2016-01-11 14:49:43', 'url': 'http://money.163.com/16/0111/14/BD2BHH85002551G6.html'},
{'title': '视频网站为何对快播痛下杀手?', 'created_at': '2016-01-11 14:30:31', 'url': 'http://money.163.com/16/0111/14/BD2AEC0E002551G6.html'},
{'title': '黎万强重振小米是个伪命题?', 'created_at': '2016-01-05 13:51:55', 'url': 'http://money.163.com/16/0105/13/BCIPRCDP002551G6.html'},
{'title': '手机厂商频死亡 将大洗牌?', 'created_at': '2015-12-31 12:14:33', 'url': 'http://money.163.com/15/1231/12/BC5O9GEI002551G6.html'}, 
{'title': '2015三星与苹果暗战胜负几何?', 'created_at': '2015-12-29 14:55:41', 'url': 'http://money.163.com/15/1229/14/BC0SN3OC002551G6.html'},
{'title': '宝能作为门口野蛮人是坏人吗', 'created_at': '2015-12-19 12:31:57', 'url': 'http://money.163.com/15/1219/12/BB6SGNBI002551G6.html'}
]

如果解码时使用 utf-8 并加上 'ignore' 参数(忽略无法解码的字节),程序可以运行,但中文标题会变成乱码;如果使用 utf-8 且不加 'ignore',则会直接抛出 UnicodeDecodeError。

content = f.read().decode('utf-8','ignore')
[
{'title': 'Ծͤijɹζʵʧ?', 'created_at': '2016-04-25 14:28:18', 'url': 'http://money.163.com/16/0425/14/BLGM1PH5002551G6.html'}, {'title': 'ģʽΪкò', 'created_at': '2016-04-22 15:00:23', 'url': 'http://money.163.com/16/0422/15/BL90MCB400253G87.html'},
{'title': 'ǰƾͲܿԼ', 'created_at': '2016-04-12 15:30:49', 'url': 'http://money.163.com/16/0412/15/BKFAETGB002552IJ.html'}, {'title': 'Ӱҵ˾Ƶ̧', 'created_at': '2016-03-31 13:43:27', 'url': 'http://money.163.com/16/0331/13/BJG7HME600253G87.html'}, 
{'title': 'չֵֵ֥', 'created_at': '2016-03-31 08:48:45', 'url': 'http://money.163.com/16/0331/08/BJFMM2AB00253G87.html'}, {'title': 'ձҵҵIJ', 'created_at': '2016-03-18 16:40:02', 'url': 'http://money.163.com/16/0318/16/BIF2FM7A002551G6.html'}, 
{'title': 'Աֻйľ', 'created_at': '2016-03-16 09:56:58', 'url': 'http://money.163.com/16/0316/09/BI96K6L000253G87.html'}, 
{'title': 'iPhone 6s̫ʧ? ƻ贴', 'created_at': '2016-01-26 14:45:14', 'url': 'http://money.163.com/16/0126/14/BE8V83A500253G87.html'}, 
{'title': '¼˾ε', 'created_at': '2016-01-18 16:02:05', 'url': 'http://money.163.com/16/0118/16/BDKGF2C000253G87.html'}, 
{'title': 'ѹɼ۵ ƻ', 'created_at': '2016-01-11 14:49:43', 'url': 'http://money.163.com/16/0111/14/BD2BHH85002551G6.html'}, 
{'title': 'ƵվΪζԿ첥ʹɱ?', 'created_at': '2016-01-11 14:30:31', 'url': 'http://money.163.com/16/0111/14/BD2AEC0E002551G6.html'}, 
{'title': 'ǿСǸα⣿', 'created_at': '2016-01-05 13:51:55', 'url': 'http://money.163.com/16/0105/13/BCIPRCDP002551G6.html'}, 
{'title': 'ֻƵ ϴƣ', 'created_at': '2015-12-31 12:14:33', 'url': 'http://money.163.com/15/1231/12/BC5O9GEI002551G6.html'}, 
{'title': '2015ƻսʤΣ', 'created_at': '2015-12-29 14:55:41', 'url': 'http://money.163.com/15/1229/14/BC0SN3OC002551G6.html'}, 
{'title': 'ΪſҰǻ', 'created_at': '2015-12-19 12:31:57', 'url': 'http://money.163.com/15/1219/12/BB6SGNBI002551G6.html'}
]
不加 ignore 时的报错信息如下:

Traceback (most recent call last):
  File "test.py", line 21, in <module>
    content = f.read().decode('utf-8')
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc9 in position 167: invalid continuation byte

上面的程序先用 finditer 截取每条新闻的 HTML 片段,再对每个片段用 re.match 提取字段,这第二步是没必要的:可以直接在一个带分组的正则表达式中一次提取 title、created_at、url。简写如下:

#coding=utf-8

import urllib.request
import re

url = 'http://money.163.com/special/pinglun/'

result = []

# A single pass over the page: group 1 = link, group 2 = headline,
# group 3 = timestamp of each news entry.
_NEWS_RE = re.compile(
    r'<div class="list_item clearfix">.*?<h2><a href="(.*?)">(.*?)</a></h2>'
    r'.*?<span class="time">(.*?)</span>',
    re.S,
)


def download_page(page_url, encoding='gbk'):
    """Fetch *page_url* and return the response body decoded with *encoding*.

    'gbk' is the default because the page's own meta tag declares
    charset=gbk, so the bytes must be decoded with that codec.
    """
    # `with` guarantees the HTTP response is closed once the body is read.
    with urllib.request.urlopen(page_url) as response:
        return response.read().decode(encoding)


def extract_news(html):
    """Return one {'title', 'created_at', 'url'} dict per entry in *html*."""
    return [
        {'title': m.group(2), 'created_at': m.group(3), 'url': m.group(1)}
        for m in _NEWS_RE.finditer(html)
    ]


if __name__ == '__main__':
    result.extend(extract_news(download_page(url)))
    print(result)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值