通过爬取华为安全中心网站,对待查询的 URL 进行分类。
爬取过程中需要携带 Cookie 才能完成请求。
#coding=utf-8
import requests
# Query the Huawei security-center URL-classification endpoint for one domain
# and print the HTTP status code and the raw response body.

# NOTE(review): `a` is not referenced by the request below, which submits
# 'hicloud.com' instead -- confirm which domain is the intended query.
a='tinypng.com'
# Endpoint that receives the classification query as a POST form.
url = 'https://isecurity.huawei.com/sec/web/getUrlInfo.do'
# Headers replayed from a captured browser session.  The X-CSRF-TOKEN value
# is the same string as the OnceCsrfToken entry inside the Cookie header.
# NOTE(review): these session values are hard-coded; presumably they expire
# and must be refreshed from a live session -- confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Referer': 'https://isecurity.huawei.com/sec/web/urlClassification.do',
    # Fixed: the original value was 'https: // isecurity.huawei.com'; the
    # embedded spaces make it an invalid origin (scheme://host).
    'Origin': 'https://isecurity.huawei.com',
    'X-CSRF-TOKEN': '49F5A67D8FA216212B6551CC3ECACAF37AFD189FBEE6E0FD',
    'Cookie': 'JSESSIONID=9A3B2115E839B32939FCE7589CCE97AF; OnceCsrfToken=49F5A67D8FA216212B6551CC3ECACAF37AFD189FBEE6E0FD; lan=zh_CN; _ga=GA1.2.2118020189.1543372307; ELOQUA=2A18795A519C42EF8D57F1EA47270CB3; s_fid=2E4A73A84857FD04-17BF1237627AC0ED; _gid=GA1.2.1175897563.1545630474; s_cc=true; Hm_lvt_48e5a2ca327922f1ee2bb5ea69bdd0a6=1545630476; Hm_lpvt_48e5a2ca327922f1ee2bb5ea69bdd0a6=1545630476; source=corp_nav; ic_source=corp_nav_allwayson; ic_medium=hwdc; utag_main=v_id:0167582879a80022068e216a075003073003f06b00bd0$_sn:2$_ss:0$_st:1545632293156$dc_visit:2$vapi_domain:huawei.com$ses_id:1545630473220%3Bexp-session$_pn:2%3Bexp-session$dc_event:2%3Bexp-session$dc_region:ap-northeast-1%3Bexp-session; s_sq=huawei-corporate-prd%252Chuawei-global-prd%3D%2526c.%2526a.%2526activitymap.%2526page%253Dwww.huawei.com%25252Fcn%25252F%2526link%253D%2525E5%25258D%25258E%2525E4%2525B8%2525BA%2525E4%2525BA%252591%2525E6%25259C%25258D%2525E5%25258A%2525A1%2526region%253Dhw1_global_nav%2526pageIDType%253D1%2526.activitymap%2526.a%2526.c%2526pid%253Dwww.huawei.com%25252Fcn%25252F%2526pidt%253D1%2526oid%253Dhttps%25253A%25252F%25252Fwww.huawei.com%25252Fcn%25252F%252523HUAWEI-CLOUD%2526ot%253DA',
}
# requests never times out by default; bound the wait so an unresponsive
# server raises requests.exceptions.Timeout instead of hanging the script.
res = requests.post(
    url,
    data={'pageLanguage': 'zh_CN', 'urlStrInput': 'hicloud.com'},
    headers=headers,
    timeout=10,
)
print(res.status_code)
print(res.content)
# coding:utf-8
import urllib
import urllib2
import cookielib
# Open the URL-classification page through a cookie-aware opener and print
# every cookie that ends up in the jar, one "name value" pair per line.
loginUrl = 'https://isecurity.huawei.com/sec/web/urlClassification.do'
# Create the cookie container.
cj = cookielib.CookieJar()
# Build an opener whose cookie processor is backed by that container.
handler = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(handler)
# Request the URL through this opener.
response = opener.open(loginUrl)
# Show the cookies that were collected.
for each in cj:
    # Fixed: the loop body was not indented, which is an IndentationError.
    # A single pre-formatted string prints the same "name value" text as the
    # original Python 2 `print a, b` statement and also parses on Python 3.
    print('%s %s' % (each.name, each.value))
- 可访问网页的分类。
信息查询类
社区论坛类
搜索类
多媒体类
社交类
邮件类
主站类
百科类
彩票类
自营业务类
文件传输类
- 不可访问网页的分类。
安全类
报告类
测试检查类
分析类
分享类
服务类
登录注册类
脚本语言类
接口类
连接类
配置类
认证安全类
日志类
升级类
数据类
同步类
推送类
网关类
移动类
域名请求类
重定向类
安装包
帮助类
消息类
支付类
传输下载类
云服务、CDN类