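# [Forum page residue, not part of the script] "This post was flagged as a suspected violation and collapsed by the system - hide this post / view this post"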
#coding=utf-8
import chardet #字符集检测
import urllib.parse
import urllib.request
import re
import ssl
# Skip SSL certificate verification.
# NOTE(review): this swaps the ssl module's default HTTPS context factory for
# the unverified one, so it is a process-wide setting rather than something
# scoped to this script's requests - confirm that is acceptable.
ssl._create_default_https_context=ssl._create_unverified_context
# Matches a charset declaration such as `charset=utf-8`, `charset: gbk` or
# `charset="GBK"`: the word "charset", one to three separator characters
# (=, :, double quote or whitespace), then the captured encoding label.
rr = re.compile(r"\bcharset[=:\"\s]{1,3}([-_A-Z0-9]+)",re.I)


def getCode(string):
    """Return the first charset label declared in *string*.

    Args:
        string: Text to scan, e.g. a Content-Type header value or a chunk
            of HTML source. ``None`` and ``''`` are treated as "nothing to
            scan" instead of raising.

    Returns:
        The captured encoding label exactly as written in the text, or
        ``''`` when no charset declaration is present.

    The outcome is also reported on stdout.
    """
    # A response may lack the header entirely, in which case the caller
    # hands us None; the compiled pattern would raise TypeError on it.
    found = rr.search(string) if string else None
    if found is None:
        print(u'没找到编码方式')
        return ''
    encoding = found.group(1)
    print(u'编码方式: ' + encoding)
    return encoding
#getCode(r'iiifjjd charset:" utf_8iidi-oo">')
def getHtml(url):
    """Fetch *url* and return the response body decoded to ``str``.

    The encoding is determined by trying, in order:

    1. the ``charset`` in the ``Content-Type`` response header,
    2. a ``charset`` declaration inside the page source,
    3. ``chardet`` content analysis (last resort).

    Args:
        url: Address to request. The fixed query parameters ``name`` and
            ``ie`` are appended to it.

    Returns:
        The decoded page text, or ``''`` when no encoding could be
        determined.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid in the chosen encoding.
        LookupError: If the declared encoding name is unknown to Python.
    """
    headers = {
        "User-Agent": 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Referer': url,
    }
    values = {
        'name': 'hao_hao',
        'ie': 'utf-8',
    }
    data = urllib.parse.urlencode(values)
    # Join with '&' when the URL already carries a query string; a second
    # '?' would corrupt the existing parameters.
    separator = '&' if '?' in url else '?'
    req = urllib.request.Request(url=url + separator + data, headers=headers)

    # Read the whole body exactly once and release the connection. Reading
    # line by line first and calling read() afterwards would leave only the
    # unread tail (or nothing at all) to decode.
    with urllib.request.urlopen(req) as response:
        response_headers = response.headers
        the_page = response.read()

    # 1. Look for the encoding in the response header. The header may be
    #    absent, so fall back to an empty string.
    encoding = getCode(response_headers.get('Content-Type') or '')

    # 2. Look for the encoding in the page source. Charset labels are plain
    #    ASCII, so decode leniently: a strict UTF-8 decode would raise on
    #    exactly the non-UTF-8 pages whose encoding we are trying to find.
    if encoding == '':
        encoding = getCode(the_page.decode('ascii', errors='replace'))

    # 3. chardet content analysis. It is not fully reliable (e.g.
    #    https://mm.taobao.com/search_tstar_model.html is GBK but gets
    #    detected as GB2312), so it is used only when both lookups failed.
    if encoding == '':
        detected = chardet.detect(the_page)
        # chardet reports None when it cannot make a guess.
        encoding = detected['encoding'] or ''
        if encoding != '':
            print(u'chardet字符集检测\r\n编码方式: ' + encoding)

    # Print the response headers.
    print(response_headers)

    # No encoding could be found by any method.
    if encoding == '':
        return ''
    return the_page.decode(encoding)
# Demo: fetch two pages and print their decoded HTML, with a separator line
# before, between and after the outputs. The forum's anti-link filler text
# that had been pasted into the middle of both host names is removed here;
# with it in place neither URL can resolve.
print('===============================================')
for url in (
    "https://mm.taobao.com/search_tstar_model.html",  # expected to be GBK
    "http://kyfw.12306.cn/otn/leftTicket/init",  # expected to be UTF-8
):
    html = getHtml(url)
    print(html)
    print('===============================================')