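# 模拟提交表单、抓取动态数据、识别验证码 (Simulating form submission, scraping dynamic data, recognising CAPTCHAs)
# 模拟提交表单 (Simulating form submission)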
import requests
from bs4 import BeautifulSoup
def _hidden_input_value(soup, name):
    """Return the ``value`` of the form input called *name*, or ``None``.

    ``None`` is returned both when no ``<input name=...>`` exists inside a
    ``<form>`` and when the input carries no ``value`` attribute.
    """
    tag = soup.select_one('form input[name={}]'.format(name))
    if tag is None:
        return None
    return tag.attrs.get('value')


def main():
    """Log in to GitHub by replaying its HTML login form with ``requests``.

    Fetches the login page, prints the cookies it set, reads the hidden
    ``utf8`` and ``authenticity_token`` inputs from the form and prints
    them, then POSTs them together with the (blank placeholder)
    credentials to ``/session`` and prints the response body.

    Returns early, after printing a short message, when the login page
    does not answer with HTTP 200 or has no ``authenticity_token`` input.
    """
    timeout = 10  # seconds; without it a stalled server blocks forever

    # One session for both requests: cookies set while loading the login
    # page are sent again with the POST without being copied by hand.
    with requests.Session() as session:
        resp = session.get('https://github.com/login', timeout=timeout)
        if resp.status_code != 200:
            print('unexpected status code:', resp.status_code)
            return
        cookies = resp.cookies.get_dict()
        print(cookies)

        soup = BeautifulSoup(resp.text, 'lxml')
        utf8_value = _hidden_input_value(soup, 'utf8')
        authenticity_token = _hidden_input_value(soup, 'authenticity_token')
        print(utf8_value)
        print(authenticity_token)
        if authenticity_token is None:
            # Posting without the token cannot succeed, so stop here
            # instead of failing later on a None lookup.
            print('authenticity_token not found in the login form')
            return

        data = {}
        # The hidden utf8 input is optional: only echo it back when the
        # page actually contains it.
        if utf8_value is not None:
            data['utf8'] = utf8_value
        data.update({
            'authenticity_token': authenticity_token,
            'login': '',
            'password': '',
        })
        resps = session.post(
            'https://github.com/session', data=data, timeout=timeout,
        )
        print(resps.text)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
# 使用robobrowser (Using robobrowser)
import robobrowser
import requests
def main():
    """Submit GitHub's login form through RoboBrowser and list page links.

    Opens the login page, looks up the form whose action is ``/session``,
    fills the ``login`` and ``password`` fields with blank placeholders,
    submits the form, prints the resulting response object and then
    prints the ``href`` of every ``<a href=...>`` on the page the browser
    is on afterwards.

    Returns early, after printing a short message, when the login form
    cannot be found.
    """
    b = robobrowser.RoboBrowser(history=True, parser='lxml')
    b.open('https://github.com/login')
    f = b.get_form(action='/session')
    if f is None:
        # get_form() yields None when nothing matches; indexing it would
        # fail with an unhelpful TypeError.
        print('login form not found')
        return
    f['login'].value = ''
    f['password'].value = ''
    # submit_form() returns None; the reply is kept on the browser object,
    # so print that instead of the call's return value.
    b.submit_form(f)
    print(b.response)
    for a_tag in b.select('a[href]'):
        print(a_tag.attrs['href'])
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
# 抓取动态数据 (Scraping dynamically rendered data)
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
def main():
    """Search the Taobao live page in Chrome and print the image URLs.

    Starts a Chrome WebDriver, loads the live-content page, types ``运动``
    into the keyword search box and presses ENTER, then parses the page
    source with BeautifulSoup and prints the ``src`` of every
    ``<img src=...>`` found under ``<body>``. The browser is always shut
    down, even when one of the steps raises.
    """
    # Imported here so this block works without touching the file's
    # import section; By supplies the locator strategy for find_element().
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome()
    try:
        driver.get('https://v.taobao.com/v/content/live?catetype=704&from=taonvlang')
        # find_element(By.CSS_SELECTOR, ...) replaces the removed
        # find_element_by_css_selector() helper.
        elem = driver.find_element(
            By.CSS_SELECTOR, 'input[placeholder=输入关键词搜索]',
        )
        elem.send_keys('运动')
        elem.send_keys(Keys.ENTER)
        # NOTE(review): the source is read right after ENTER with no
        # explicit wait, so it may not yet contain the search results;
        # confirm whether a WebDriverWait is needed here.
        page_source = driver.page_source
    finally:
        # Without quit() the Chrome and chromedriver processes stay alive.
        driver.quit()

    soup = BeautifulSoup(page_source, 'lxml')
    for img_tag in soup.body.select('img[src]'):
        print(img_tag.attrs['src'])
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
# 识别验证码 (Recognising CAPTCHAs)
from io import BytesIO
from PIL import Image, ImageFilter
from pytesseract import image_to_string
import requests
import base64
def main():
    """Binarise a local image, then download and OCR a captcha image.

    Part 1 loads ``鸣人仙人模式1.jpg``, maps every channel value below 128
    to 0 and every other value to 255, writes the result to
    ``heheda.jpg`` and prints the Base64 encoding of that JPEG.

    Part 2 downloads a captcha image, saves it as ``hello.jpg``, prints
    the text pytesseract recognises in it and prints the Base64 encoding
    of the downloaded bytes.

    Raises:
        requests.HTTPError: if the captcha request returns an error status.
    """
    # Opening by path inside ``with`` closes the file handle; point()
    # returns a new image, so the result stays usable afterwards.
    with Image.open('鸣人仙人模式1.jpg') as img1:
        img3 = img1.point(lambda x: 0 if x < 128 else 255)
    img3.save('heheda.jpg')

    # b64encode() needs bytes, not an Image object: serialise the image
    # into an in-memory JPEG first and encode those bytes.
    buf = BytesIO()
    img3.save(buf, format='JPEG')
    print(base64.b64encode(buf.getvalue()))

    # The URL is one string; adjacent literals are joined by the parser.
    resp = requests.get(
        'https://pin2.aliyun.com/get_img?type=150_40&identity=mailsso.mxhichina.'
        'com&sessionid=k0xHyBxU3K3dGXb59mP9cdeTXxL9gLHSTKhRZCryHxpOoyk4lAVuJhgw==',
        timeout=10,
    )
    # Fail with the HTTP error rather than a confusing image-decoding
    # error when the server did not return an image.
    resp.raise_for_status()
    img4 = Image.open(BytesIO(resp.content))
    img4.save('hello.jpg')
    print(image_to_string(img4))
    print(base64.b64encode(resp.content))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()