# 以汽车之家为例子,抓取页面并进行解析 (Example: crawl a page from autohome.com.cn and parse it)
# -*- coding=utf-8 -*-
import urllib2
from BeautifulSoup import BeautifulSoup as bs3
import json
import codecs
#字符检测,用来检测其真实的编码格式
import chardet
#save content to file
def save_to_file(filename, content):
    """Write content to filename, truncating any existing file.

    filename: path of the file to (over)write.
    content:  string data to store.
    """
    # 'with' guarantees the handle is closed even if write() raises;
    # the original assert(f) was a no-op, since open() raises on failure
    # instead of returning a falsy value.
    with open(filename, 'w+') as f:
        f.write(content)
def parse_json_data(content):
    """Inspect the first scraped JSON fragment (debug helper).

    content: list of GB2312-encoded str fragments, each the '{...}' part
    of a 'var xxx = {...};' line cut out of the page (see parse_content).
    """
    # Print the detected encoding of the first fragment as a sanity check.
    print(chardet.detect(content[0]))
    # fix: removed the unused local 'name_list' the original defined here.
    # NOTE(review): json.dumps on a decoded *string* merely re-quotes it;
    # json.loads is probably what was intended — confirm before changing.
    print(json.dumps(content[0].decode('GB2312')))
def parse_content(content):
    """Extract the inline JSON variable assignments from an Autohome page.

    content: raw HTML of the page; GB2312-encoded per the comment below.
    Finds the <script> element whose text contains 'var levelId', slices
    each long 'var xxx = {...};' line down to its '{...}' object literal,
    and passes the collected fragments to parse_json_data().
    """
    # content is GB2312-encoded
    soup = bs3(content)
    key_text = 'var levelId'
    # locate the <script> whose text mentions key_text
    # (uses Python 2-only tuple-parameter lambda syntax)
    elem_lib = soup.find('script', text=lambda(x):key_text in x)
    # str_script is UTF-8 encoded
    str_script = str(elem_lib.string)
    #print(chardet.detect(str_script))
    # the console is cp936/GBK; printing text in a mismatched encoding
    # fails, so transcode UTF-8 -> GB2312 before any debug printing
    strGBK = str_script.decode('utf-8').encode('gb2312')
    #print(strGBK)
    # strip the HTML escape for non-breaking space
    strGBK = strGBK.replace('&nbsp','')
    d = strGBK.splitlines()
    list_data = []
    for i in d:
        if i.isspace():
            continue
        # skip variables we do not need (short lines)
        if len(i) < 100:
            continue
        # take the JSON part: everything from the first '{'
        idx = i.find('{')
        if idx == -1:
            continue
        # drop the trailing ';'
        k = i[idx:-1]
        list_data.append(k)
    parse_json_data(list_data)
    '''
    print('json.count=', len(list_data))
    for i in list_data:
    if len(i) > 200:
    print(i[0:200])
    else:
    print(i)
    parse_json_data(list_data)
    '''
    # exec cannot be used directly inside a function here, but eval can
    '''
    strSentece = ''
    for i in d:
    if i.isspace():
    continue
    if 'null' in j:
    continue
    #移除var的类型定义,javascript需要,python不需要
    j = i[4:]
    strSentece += i
    #可以直接在python中执行json的赋值语句,类似dict赋值
    exec(strSentece)
    #输出变量数据
    var_list = ['keyLink', 'config','option','color','innerColor']
    for i in var_list:
    exec('print %s' % (i,))
    '''
def crawler_4_autohome():
    """Download the Autohome series-657 config page and parse it."""
    autohome_url = 'http://car.autohome.com.cn/config/series/657.html'
    # NOTE(review): the original note said 'uft-8', but parse_content
    # treats the body as GB2312 — confirm which encoding the site serves.
    response = urllib2.urlopen(url=autohome_url)
    try:
        content = response.read()
    finally:
        # fix: the original never closed the response, leaking the
        # HTTP connection on every call.
        response.close()
    #print(chardet.detect(content))
    parse_content(content)
# Script entry point: crawl and parse the Autohome series page.
if __name__ == '__main__':
    crawler_4_autohome()