# After logging in, make a second request with the same session to download the page.
import requests
from lxml import etree
from PIL import Image
import pytesseract
def html(session):
    """Fetch the login page and return it parsed as an lxml HTML tree.

    Args:
        session: Object whose ``get(url=..., headers=...)`` returns a
            response exposing ``text`` (a ``requests.Session`` in this
            script).

    Returns:
        The result of ``etree.HTML`` applied to the response text.
    """
    login_page = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
    # ``headers`` is the module-level request-header dict of this script.
    response = session.get(url=login_page, headers=headers)
    return etree.HTML(response.text)
def dowloud(session):
    """Download the captcha image from the login page to ``./code.png``.

    The image URL is built from the ``src`` attribute of the
    ``<img id="imgCode">`` element on the login page.

    Args:
        session: Session used for both the page request and the image
            request, so both share the same cookies.

    Raises:
        IndexError: If the login page contains no ``img#imgCode`` element.
    """
    login_tree = html(session)
    relative_src = login_tree.xpath('//img[@id="imgCode"]/@src')[0]
    captcha_url = 'https://so.gushiwen.cn' + relative_src
    captcha_response = session.get(url=captcha_url, headers=headers)
    captcha_bytes = captcha_response.content
    with open('./code.png', 'wb') as image_file:
        image_file.write(captcha_bytes)
def distinguish(image_path='./code.png', threshold=140):
    """Run OCR on the saved captcha image and return the recognized text.

    The image is converted to grayscale, binarized with ``threshold`` and
    then handed to Tesseract through ``pytesseract``.

    Args:
        image_path: Path of the captcha image to read. Defaults to the
            file written by the download step.
        threshold: Grayscale cut-off in ``0..255``. Levels below it map to
            0 (black); all other levels map to 1 (white in mode ``'1'``).

    Returns:
        The raw string returned by ``pytesseract.image_to_string``; it may
        contain whitespace or newlines that the caller should strip.
    """
    import os

    # Point pytesseract at the local Windows install only when it exists;
    # otherwise keep pytesseract's configured command (e.g. one on PATH).
    tesseract_exe = r'D:\tesseract\Tesseract-OCR\tesseract.exe'
    if os.path.exists(tesseract_exe):
        pytesseract.pytesseract.tesseract_cmd = tesseract_exe

    lookup = [0 if level < threshold else 1 for level in range(256)]
    with Image.open(image_path) as img:
        # OCR the binarized image. The previous code built it but then
        # discarded it and recognized the un-thresholded grayscale image.
        binary = img.convert('L').point(lookup, '1')
        return pytesseract.image_to_string(binary.convert('RGB'))
def login(session):
    """Submit the login form, then save the collection page to ``gushi.html``.

    Steps, in order:
      1. Fetch the login page and read the hidden ``__VIEWSTATE`` and
         ``__VIEWSTATEGENERATOR`` form values.
      2. OCR the previously saved captcha image and print the text.
      3. POST the login form and print the response object.
      4. GET the collection page with the same session and write its
         text to ``gushi.html`` (UTF-8).

    Args:
        session: Session carrying the cookies across all requests.

    Raises:
        IndexError: If either hidden form field is missing from the page.
    """
    login_tree = html(session)
    view_state = login_tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
    view_state_generator = login_tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]
    post_url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'

    # Drop the newlines contained in the OCR output before submitting it.
    captcha_text = distinguish().replace("\n", "")
    print(captcha_text)

    form = {
        '__VIEWSTATE': view_state,
        '__VIEWSTATEGENERATOR': view_state_generator,
        'from': 'http://so.gushiwen.cn/user/collect.aspx',
        'email': '2319899766@qq.com',
        'pwd': '密码',
        'code': captcha_text,
        'denglu': '登录',
    }
    post_response = session.post(url=post_url, headers=headers, data=form)
    # Prints the response object itself, not the page body.
    print(post_response)

    collect_url = 'https://so.gushiwen.cn/user/collect.aspx'
    collect_page = session.get(url=collect_url, headers=headers)
    collect_html = collect_page.text
    with open('gushi.html', 'w', encoding='utf-8') as out_file:
        out_file.write(collect_html)
# Request headers shared by every HTTP call in this module. Defined at
# module level (not under the ``__main__`` guard) so html/dowloud/login
# also work when this file is imported rather than run as a script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}

if __name__ == '__main__':
    # One session for the whole flow so the captcha and login share cookies.
    session = requests.Session()
    dowloud(session)
    # login() runs the OCR itself, so no separate distinguish() call is
    # needed here; the previous standalone call discarded its result.
    login(session)