'''
import gzip
import sys
import re
import urllib.request
import urllib.parse
import urllib.error
import http.cookiejar
from bs4 import BeautifulSoup
import datetime
import random
# Module-level set; presumably meant to track visited pages, but nothing in
# the visible code reads or writes it -- TODO confirm against the rest of the file.
pages = set()
# Seed the PRNG from the wall clock. random.seed() accepts only None, int,
# float, str, bytes or bytearray (other types raise TypeError since
# Python 3.11), so pass the POSIX timestamp instead of the datetime object.
random.seed(datetime.datetime.now().timestamp())
# Collect the list of all internal links found on a page.
def getInternalLinks(bsObj, includeUrl):
    """Return the distinct internal hrefs found in ``bsObj``.

    An ``<a>`` tag counts as internal when its ``href`` starts with "/" or
    contains ``includeUrl``.

    Args:
        bsObj: Parsed document exposing ``findAll(name, href=pattern)``
            (a BeautifulSoup object in this file).
        includeUrl: Site URL / host name that marks a link as internal. It is
            matched as literal text, not as a regular expression.

    Returns:
        A list of href strings without duplicates, in the order in which
        ``findAll`` yielded them.
    """
    # re.escape keeps "." and other metacharacters in the URL from acting as
    # regex operators; unescaped, "example.com" would also match "exampleXcom".
    internalPattern = re.compile('^(/|.*' + re.escape(includeUrl) + ')')
    internalLinks = []
    seen = set()  # constant-time duplicate check instead of scanning the list
    for link in bsObj.findAll('a', href=internalPattern):
        href = link.attrs['href']
        if href is not None and href not in seen:
            seen.add(href)
            internalLinks.append(href)
    return internalLinks
'http 方式模拟登陆网站'
def ungzip(data):
    """Return ``data`` gunzipped, or unchanged if it is not valid gzip.

    Progress messages are printed to stdout before and after the attempt.

    Args:
        data: Raw response body as bytes.

    Returns:
        The decompressed bytes when ``data`` is a valid gzip stream,
        otherwise ``data`` itself.
    """
    import zlib  # local import: zlib is not among the file's top-level imports

    try:
        print("正在解压.....")
        data = gzip.decompress(data)
        print("解压成功")
    # Only the failures that mean "not (valid) gzip" are treated as
    # "nothing to decompress": bad magic/header (OSError / BadGzipFile),
    # truncated stream (EOFError), corrupt deflate data (zlib.error).
    # Anything else (e.g. KeyboardInterrupt) now propagates.
    except (OSError, EOFError, zlib.error):
        print("未经压缩,无需解压")
    return data
# Log in to the site over HTTP by POSTing the login form, then save and print
# the cookies collected by the opener.
LoginUrl = "http://网址.com:9090/jsFrame/login.aspx?login=login"
headers = {
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Referer': 'http://网址.com:9090/jsFrame/login.aspx?login=login',
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept-Encoding': 'gzip, deflate',
    'Host': '网址.com:9090',
    'Connection': 'Keep-Alive',
    'Pragma': 'no-cache'
}
# Hidden ASP.NET form fields sent back with the login form.
__VIEWSTATE = '/wEPDwUKMTgyNjAzNjE0MQ9kFgICAQ9kFgICCQ8QDxYCHgdWaXNpYmxlaGRkZGQYAQUeX19Db250cm9sc1JlcXVpcmVQb3N0QmFja0tleV9fFgEFEmNoa1JlbWViZXJQYXNzd29yZGHjnndASufNAaraxhc4Fq1KydHN'
__EVENTVALIDATION = '/wEWBgKthPnxBQLT8dy8BQKd+7qdDgK1qbSRCwLPx7zUAgLf2eqGAzVtS60EPvfNOGv+JEjkKNxzBqnS'
# NOTE(review): credentials are hard-coded in the source; consider loading
# them from the environment or a config file kept out of version control.
txtUserID = '123'
txtPwd = '密码'
txtPassword = '123'
postDict = {
    '__VIEWSTATE': __VIEWSTATE,
    '__EVENTVALIDATION': __EVENTVALIDATION,
    'txtUserID': txtUserID,
    'txtPwd': txtPwd,
    'txtPassword': txtPassword,
    'Image1': ' '
}
# urllib.parse.urlencode() takes a mapping (or a sequence of 2-tuples) and
# returns an ASCII string; it must be encoded to bytes before being used as
# the request's data parameter.
postdata = urllib.parse.urlencode(postDict).encode()
cookie_filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(cookie_filename)
cookie_support = urllib.request.HTTPCookieProcessor(cookie)
# Build an opener that stores cookies in `cookie` and sends them on later requests.
opener = urllib.request.build_opener(cookie_support)
# Bundle the URL, the HTTP headers and the POST body into one request.
request = urllib.request.Request(LoginUrl, data=postdata, headers=headers)
try:
    # Send the request; the context manager closes the connection once the
    # body has been read.
    with opener.open(request) as resp:
        # Decompress the body if the server gzipped it.
        response = ungzip(resp.read())
    # Decode the body to text (bytes.decode() defaults to UTF-8).
    page = response.decode()
    bsObj = BeautifulSoup(page, "html.parser")
    for link in bsObj.findAll('iframe'):
        print(link)
except urllib.error.HTTPError as e:
    # The server answered with an HTTP error status.
    print(e.code, ':', e.reason)
except urllib.error.URLError as e:
    # No HTTP response at all (DNS failure, refused connection, ...).
    # A plain URLError has no `.code`, so only the reason is reported.
    print('URLError', ':', e.reason)
# Save the cookies to cookie.txt, including session cookies (ignore_discard)
# and already-expired ones (ignore_expires).
cookie.save(ignore_discard=True, ignore_expires=True)
print(cookie)
for item in cookie:
    print('Name = ' + item.name)
    print('Value = ' + item.value)
#$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# POST an XML query to the notice proxy through the cookie-carrying opener
# and print the decoded response.
t_headers = {
    'Accept': '*/*',
    'Accept-Language': 'zh-cn',
    'Referer': 'http://erp.sciyon.com:9090/NM/JsFrame/HomeShow/Inform.aspx?title=新闻公告&homeitemid=101',
    'x-requested-with': 'XMLHttpRequest',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept-Encoding': 'gzip, deflate',
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Host': 'erp.sciyon.com:9090',
    'DNT': '1',
    'Connection': 'Keep-Alive',
    'Pragma': 'no-cache'
}
# NOTE(review): t_headers is defined but never attached to the request below
# (the Request-based variant was commented out in the original). Its Referer
# contains non-ASCII text, which http.client cannot encode as a latin-1 header
# value -- confirm and percent-encode it before wiring the headers in.
t_url = 'http://erp.sciyon.com:9090/NM/Proxy/NoticeProxy.aspx'
# Opening/closing XML tag fragments. The payload below is written out as a
# single literal, so these are not used in the visible code.
Data = '<Data>'
Data2 = '</Data>'
Action = '<Action>'
Action2 = '</Action>'
TYPE = '<TYPE>'
TYPE2 = '</TYPE>'
STATE = '<STATE>'
STATE2 = '</STATE>'
AUTHORIZATION = '<AUTHORIZATION>'
AUTHORIZATION2 = '</AUTHORIZATION>'
HOMEPAGEID = '<HOMEPAGEID>'
HOMEPAGEID2 = '</HOMEPAGEID>'
# Despite the name, this is the raw XML request body (a str), not a dict.
get_postDict = '''<Data><Action>GETNOTICEDATABYWHERE</Action><TYPE></TYPE><STATE>APPROVE</STATE><AUTHORIZATION>1</AUTHORIZATION><HOMEPAGEID>101</HOMEPAGEID></Data>'''
# At this point the opener should already hold the cookies obtained earlier.
try:
    # Send the XML body as UTF-8 bytes; the context manager closes the
    # connection once the body has been read.
    with opener.open(t_url, get_postDict.encode('utf-8')) as resp:
        # Decompress the body if the server gzipped it.
        get_response = ungzip(resp.read())
    # Decode the body to text (bytes.decode() defaults to UTF-8).
    page = get_response.decode()
    print(page)
    # Parsing the page with BeautifulSoup and printing its <a> tags was
    # disabled in the original and remains disabled here.
except urllib.error.HTTPError as e:
    # The server answered with an HTTP error status.
    print(e.code, ':', e.reason)
except urllib.error.URLError as e:
    # No HTTP response at all (DNS failure, refused connection, ...).
    # A plain URLError has no `.code`, so only the reason is reported.
    print('URLError', ':', e.reason)
print("*********************************************************")
#$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#----------------------------------------------------------------
#利用cookie请求访问另外一个网址
'''
get_headers = {
'Accept':'*/*',
'Accept-Language':'zh-cn',
'Referer':'http://IP/ERP_OA/WorkTask/TaskQuery/ListPage.aspx',
'x-requested-with':'Ext.basex',
'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
'Accept-Encoding':'gzip, deflate',
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
'Host':'IP',
'Connection':'Keep-Alive',
'Pragma':'no-cache'
}
get_url = 'http://IP/ERP_OA/WorkTask/TaskQuery/ListPage.aspx?FlowGuid=8D11A66F0EAF44FCBBD9DEBEE6D45BFE'
start = 0
limit = 30
SORTFIELD = 'FTASKID'
SORTTYPE = 'DESC'
ACTION = 'GETWORKTASK'
QUERYPARA = '%3CData%3E%3CQueryPara%3E%3CBEGDATE%3E2017-01-01%3C%2FBEGDATE%3E%3CENEDATE%3E2017-01-13%3C%2FENEDATE%3E%3CFCLASS%3E%3C%2FFCLASS%3E%3CFTYPE%3E%3C%2FFTYPE%3E%3CFDELAY%3E%3C%2FFDELAY%3E%3CFTITLE%3E%3C%2FFTITLE%3E%3CFEEDBACKID%3E%3C%2FFEEDBACKID%3E%3CFSUSER%3E%3C%2FFSUSER%3E%3CFRUSER%3E%3C%2FFRUSER%3E%3CSTATE%3E%3C%2FSTATE%3E%3C%2FQueryPara%3E%3C%2FData%3E'
get_postDict = {
'start':start,
'limit':limit,
'SORTFIELD':SORTFIELD,
'SORTTYPE':SORTTYPE,
'ACTION':ACTION,
'QUERYPARA':QUERYPARA
}
get_postdata=urllib.parse.urlencode(postDict).encode()
get_request = urllib.request.Request(get_url,get_postdata,headers=get_headers)
#这时openner对象中应该含有前面获取到的cookie信息
try:
#模拟浏览器发送请求,并获取返回结果
get_response = opener.open(get_request)
#将返回结果解压
get_response = ungzip(get_response.read())
#将返回结果解码
page = get_response.decode()
print(page)
#bsObj = BeautifulSoup(page,"html.parser")
for link in bsObj.findAll('a'):
print(link)
except urllib.error.URLError as e:
print(e.code,':',e.reason)
#get_response = ungzip(opener.open(get_request).read())
#print(get_response.decode())
#------------------------------------------------------------------------------------------------------------------
'''