import requests
from lxml import etree
import ddddocr
# Log in to so.gushiwen.cn: fetch the login page, download and OCR the captcha,
# then post the login form and save the server's response to go.html.

LOGIN_PAGE_URL = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
LOGIN_POST_URL = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'
REDIRECT_TARGET = 'http://so.gushiwen.cn/user/collect.aspx'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 Edg/91.0.864.48'
}
# Seconds to wait on each HTTP call; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10
# Value used when the login page does not expose a __VIEWSTATEGENERATOR field.
DEFAULT_VIEWSTATE_GENERATOR = 'C93BE1AE'


def first_match(matches, description):
    """Return the first item of an XPath result list.

    Args:
        matches: The list returned by an ``xpath`` query.
        description: Human-readable name of what was being looked up; used
            in the error message.

    Raises:
        IndexError: If ``matches`` is empty (same exception type as a bare
            ``[0]``, but with a message saying what was missing).
    """
    if not matches:
        raise IndexError('{} not found in the login page'.format(description))
    return matches[0]


def resolve_captcha_url(src):
    """Turn the captcha ``src`` attribute into an absolute URL.

    ``urljoin`` handles root-relative, page-relative and already-absolute
    values, whereas plain string concatenation only works for the first.
    """
    from urllib.parse import urljoin

    return urljoin(LOGIN_PAGE_URL, src)


def build_login_form(viewstate, viewstate_generator, email, password, captcha_text):
    """Build the form fields posted to the login endpoint.

    Args:
        viewstate: Value of the page's ``__VIEWSTATE`` hidden input.
        viewstate_generator: Value for ``__VIEWSTATEGENERATOR``.
        email: Account e-mail address.
        password: Account password.
        captcha_text: Text recognised from the captcha image.

    Returns:
        A dict suitable for ``session.post(..., data=...)``.
    """
    return {
        '__VIEWSTATE': viewstate,
        '__VIEWSTATEGENERATOR': viewstate_generator,
        # The original value was 'http: //...' with a stray space after the
        # scheme, which is not a valid URL.
        'from': REDIRECT_TARGET,
        'email': email,
        'pwd': password,
        'code': captcha_text,
        'denglu': '登录',
    }


def main():
    """Run the whole login flow and write its artefacts to the working directory.

    Side effects: writes ``验证码图片.png`` (captcha image), ``文字.txt``
    (recognised captcha text) and ``go.html`` (login response body), and
    prints the recognised text and the login response's HTTP status code.
    """
    import os

    # NOTE(review): credentials should not live in source control. They can be
    # supplied via environment variables; the literals remain only as
    # backward-compatible defaults and should be removed/rotated.
    email = os.environ.get('GUSHIWEN_EMAIL', '1434463715@qq.com')
    password = os.environ.get('GUSHIWEN_PASSWORD', 'h18894898963')

    # A Session keeps cookies across requests, so the login POST carries the
    # cookies set while fetching the page and captcha (presumably the server
    # ties the captcha to that cookie - confirm). The context manager closes
    # the connection pool on exit.
    with requests.Session() as session:
        login_page_html = session.get(
            url=LOGIN_PAGE_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT
        ).text
        # Parse once; the same tree serves the captcha and hidden-field lookups.
        tree = etree.HTML(login_page_html)

        captcha_src = first_match(
            tree.xpath('//*[@id="imgCode"]/@src'), 'captcha image (#imgCode)'
        )
        captcha_bytes = session.get(
            url=resolve_captcha_url(captcha_src),
            headers=HEADERS,
            timeout=REQUEST_TIMEOUT,
        ).content
        with open('验证码图片.png', 'wb') as fp:
            fp.write(captcha_bytes)

        # ddddocr offers more modes than shown here; see the author's GitHub:
        # https://github.com/sml2h3/ddddocr
        ocr = ddddocr.DdddOcr()
        # Feed the downloaded bytes directly instead of re-reading the file.
        res = ocr.classification(captcha_bytes)
        print(res)
        with open('文字.txt', 'w', encoding='utf-8') as fp:
            fp.write(res)

        viewstate = first_match(
            tree.xpath('// *[ @ id = "__VIEWSTATE"]/@value'),
            '__VIEWSTATE hidden field',
        )
        # Prefer the value the page actually serves; fall back to the constant
        # that was previously hard-coded.
        generator_matches = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')
        if generator_matches:
            viewstate_generator = generator_matches[0]
        else:
            viewstate_generator = DEFAULT_VIEWSTATE_GENERATOR

        form = build_login_form(viewstate, viewstate_generator, email, password, res)
        login_response = session.post(
            url=LOGIN_POST_URL, headers=HEADERS, data=form, timeout=REQUEST_TIMEOUT
        )
        with open('go.html', 'w', encoding='utf-8') as fp:
            fp.write(login_response.text)
        print(login_response.status_code)


if __name__ == "__main__":
    main()