python3爬虫学习系列06 -表单交互

最新推荐文章于 2023-06-27 09:21:15 发布

Idea King

最新推荐文章于 2023-06-27 09:21:15 发布

阅读量244

点赞数 1

分类专栏： python3 爬虫

本文链接：https://blog.csdn.net/besmarterbestronger/article/details/98957872

版权

python3 同时被 2 个专栏收录

56 篇文章 3 订阅

订阅专栏

爬虫

13 篇文章 0 订阅

订阅专栏

文章目录

一、登陆表单
二、使用Mechanize模块实现自动化表单处理
三、参考文献

之前的博客：
爬虫学习系列02-常见的下载和抽取网页的方法
爬虫学习系列03-下载缓存
爬虫学习系列04 - 并发下载
爬虫学习系列05 - 获取动态内容

本节，将与网页进行交互，根据用户输入返回对应的内容。

发送POST请求提交表单；
使用cookie登陆网站；
用于简化表单提交的高级模块Mechanize。

一、登陆表单

表单有几个重要的组成部分，分别是 <form> 标签的action、enctype和method属性。
其中若action="#"，则表示此表单的提交url与此页面的url相同。
enctype属性用于设置数据提交的编码。默认的编码类型为application/x-www-form-urlencoded，此时所有非字母数字的字符都需要转换为“%”加十六进制ASCII码的形式（即URL编码，例如“@”会被编码为“%40”），如果存在大量非字母数字的字符，那么这种编码的效率就会非常低。针对这种情况，可以使用multipart/form-data作为编码类型，使用这种编码类型不会对数据进行URL编码，而是使用MIME（Multipurpose Internet Mail Extensions，多用途互联网邮件扩展）协议将其作为多个部分进行发送，这和邮件的传输标准相同。

可以通过Python的urllib、requests等库实现自动化提交登陆表单。

# -*- coding: utf-8 -*-

import glob
import http.cookiejar as cookielib
import json
import os
import sqlite3
import time
import urllib
import urllib.parse
import urllib.request

import lxml.html

# Email address used to log in
LOGIN_EMAIL = 'example@webscraping.com'
# Password used to log in
LOGIN_PASSWORD = 'example'
# URL of the login page; the login functions also POST the form to it
LOGIN_URL = 'http://example.webscraping.com/user/login'


def login_basic():
    """Try to log in by POSTing only the email and password.

    According to the original note this attempt fails: the form's
    ``_formkey`` field is not submitted, so the site redirects back to the
    login page.  The final URL is printed so that redirect can be observed.
    """
    data = {'email': LOGIN_EMAIL, 'password': LOGIN_PASSWORD}
    # A POST body must be bytes in Python 3, so encode the form string.
    encoded_data = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(LOGIN_URL, encoded_data)
    # urlopen lives in urllib.request (urllib.parse has no urlopen).
    with urllib.request.urlopen(request) as response:
        # A failed login ends up back on the login URL.
        print(response.geturl())


def login_formkey():
    """Try to log in by submitting every form input, but without cookies.

    All ``<input>`` values of the login form (as collected by
    ``parse_form``) are posted together with the credentials.  According to
    the original note this still fails, because the ``_formkey`` value is
    checked against one kept in a cookie and no cookies are sent here.  The
    final URL is printed so the redirect back to the login page can be seen.
    """
    # urlopen lives in urllib.request (urllib.parse has no urlopen).
    with urllib.request.urlopen(LOGIN_URL) as login_page:
        html = login_page.read()
    data = parse_form(html)
    data['email'] = LOGIN_EMAIL
    data['password'] = LOGIN_PASSWORD
    # A POST body must be bytes in Python 3, so encode the form string.
    encoded_data = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(LOGIN_URL, encoded_data)
    with urllib.request.urlopen(request) as response:
        # A failed login ends up back on the login URL.
        print(response.geturl())


def login_cookies():
    """Log in with a cookie-aware opener and return that opener.

    The login page is fetched and the form is posted through the same
    opener, so cookies set by the first response are sent with the second.
    The final URL is printed; per the original note, a failed login is
    redirected back to the login page.

    Returns:
        The opener built around the cookie jar, for further requests.
    """
    # The jar keeps cookies between the two requests made below.
    cj = cookielib.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    # Fetch the login page to collect the form's input values.
    html = opener.open(LOGIN_URL).read()
    data = parse_form(html)
    data['email'] = LOGIN_EMAIL
    data['password'] = LOGIN_PASSWORD
    # A POST body must be bytes in Python 3, so encode the form string.
    encoded_data = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(LOGIN_URL, encoded_data)
    response = opener.open(request)
    print(response.geturl())
    return opener


def login_firefox(url='http://example.webscraping.com/edit/United-Kingdom-239'):
    """Build an opener from cookies stored in a Firefox session file.

    Args:
        url: Page fetched with the loaded cookies.  The default is the
            country edit page used by the companion Mechanize script; this
            module defines no ``COUNTRY_URL`` of its own.

    Returns:
        The opener built around the loaded cookie jar.
    """
    session_filename = find_ff_sessions()
    if session_filename is None:
        # No profile matched: continue with an empty jar instead of passing
        # None on to os.path.exists(), which raises TypeError.
        print('No Firefox session file found')
        cj = cookielib.CookieJar()
    else:
        cj = load_ff_sessions(session_filename)
    # urllib2 does not exist in Python 3; build_opener is in urllib.request.
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    html = opener.open(url).read()

    tree = lxml.html.fromstring(html)
    # Print the text of the first navbar link.
    # NOTE(review): presumably this shows the login state - confirm.
    print(tree.cssselect('ul#navbar li a')[0].text_content())
    return opener


def parse_form(html):
    """Collect the name/value pairs of every named ``<input>`` inside a form.

    Args:
        html: Markup of the page holding the form.

    Returns:
        A dict mapping each input's ``name`` to its ``value`` attribute
        (``None`` when the attribute is absent).  Inputs without a name are
        skipped; for repeated names the last one wins.
    """
    inputs = lxml.html.fromstring(html).cssselect('form input')
    return {
        field.get('name'): field.get('value')
        for field in inputs
        if field.get('name')
    }



def load_ff_sessions(session_filename):
    """Build a cookie jar from a Firefox ``sessionstore.js`` JSON file.

    Args:
        session_filename: Path of the session file to read.

    Returns:
        A ``CookieJar`` with one cookie per entry found under
        ``windows[*].cookies``.  The jar is empty when the file does not
        exist or is not valid JSON; a message is printed in either case.
    """
    cj = cookielib.CookieJar()
    if not os.path.exists(session_filename):
        print('Session filename does not exist:', session_filename)
        return cj

    try:
        # The context manager closes the file even when parsing fails.
        with open(session_filename, 'rb') as session_file:
            json_data = json.loads(session_file.read())
    except ValueError as e:
        print('Error parsing session JSON:', str(e))
        return cj

    # Every cookie gets a one-week lifetime from now.  It must be an int:
    # Cookie.is_expired() compares it with the current time as a number.
    expires = int(time.time()) + 3600 * 24 * 7
    for window in json_data.get('windows', []):
        for cookie in window.get('cookies', []):
            host = cookie.get('host', '')
            initial_dot = host.startswith('.')
            c = cookielib.Cookie(
                0, cookie.get('name', ''), cookie.get('value', ''),
                None, False,
                host, initial_dot, initial_dot,
                cookie.get('path', ''), False,
                False, expires, False,
                None, None, {})
            cj.set_cookie(c)
    return cj


def find_ff_sessions():
    """Look for a Firefox ``sessionstore.js`` in the usual profile folders.

    The Linux, macOS and Windows profile locations are tried in that order.

    Returns:
        The first matching path, or ``None`` when nothing matches.
    """
    paths = [
        '~/.mozilla/firefox/*.default',
        '~/Library/Application Support/Firefox/Profiles/*.default',
        # %APPDATA% already points at ...\AppData\Roaming on Windows, so
        # "Roaming" must not be repeated after it.
        '%APPDATA%/Mozilla/Firefox/Profiles/*.default',
    ]
    for path in paths:
        filename = os.path.join(path, 'sessionstore.js')
        # expanduser resolves "~"; expandvars resolves %APPDATA% on Windows
        # (expanduser alone leaves it as literal text).
        pattern = os.path.expandvars(os.path.expanduser(filename))
        matches = glob.glob(pattern)
        if matches:
            return matches[0]
    return None


def main():
    """Run the cookie-based login, the variant its docstring marks as working."""
    login_cookies()


# Only run the login when executed as a script, not when imported.
if __name__ == '__main__':
    main()

关于cookie、session的理解建议仔细看这篇博客 python3下使用requests实现模拟用户登录 —— 基础篇（马蜂窝），讲的通俗易懂。

二、使用Mechanize模块实现自动化表单处理

# -*- coding: utf-8 -*-

import urllib
import mechanize
import login

# URL of the country edit page that both functions below fetch and re-submit
COUNTRY_URL = 'http://example.webscraping.com/edit/United-Kingdom-239'



def edit_country():
    """Approach 1: raise the country's population by one with a raw POST.

    Logs in through ``login.login_cookies()``, reads the edit form at
    ``COUNTRY_URL``, posts it back with ``population`` increased by one, then
    re-reads the form and prints the new value.
    """
    import pprint

    opener = login.login_cookies()
    country_html = opener.open(COUNTRY_URL).read()
    data = login.parse_form(country_html)
    pprint.pprint(data)
    print('Population before: ' + data['population'])
    data['population'] = int(data['population']) + 1
    # A POST body must be bytes in Python 3, so encode the form string.
    encoded_data = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(COUNTRY_URL, encoded_data)
    opener.open(request)

    # Fetch the page again to confirm that the update was stored.
    country_html = opener.open(COUNTRY_URL).read()
    data = login.parse_form(country_html)
    print('Population after:', data['population'])



def mechanize_edit():
    """Approach 2: raise the country's population by one using Mechanize.

    Logs in through the first form on the login page, then opens the country
    edit page, submits its first form with ``population`` increased by one,
    and finally re-opens the page to print the stored value.
    """
    browser = mechanize.Browser()

    def open_first_form(url):
        # Load the page and select its first form for reading and editing.
        browser.open(url)
        browser.select_form(nr=0)

    # Log in.
    open_first_form(login.LOGIN_URL)
    print(browser.form)
    browser['email'] = login.LOGIN_EMAIL
    browser['password'] = login.LOGIN_PASSWORD
    browser.submit()

    # Edit the country.
    open_first_form(COUNTRY_URL)
    print('Population before:', browser['population'])
    incremented = int(browser['population']) + 1
    browser['population'] = str(incremented)
    browser.submit()

    # Check that the population increased.
    open_first_form(COUNTRY_URL)
    print('Population after:', browser['population'])


# When run as a script, perform the edit twice: once with the raw POST
# approach and once with Mechanize.
if __name__ == '__main__':
    edit_country()
    mechanize_edit()

三、参考文献

[1]《用python写web爬虫(web scraping with python)》

Idea King

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
python3爬虫学习系列06 -表单交互

文章目录一、登陆表单2. 使用Mechanize模块实现自动化表单处理三、参考文献之前的博客：爬虫学习系列02-常见的下载和抽取网页的方法爬虫学习系列03-下载缓存爬虫学习系列04 - 并发下载爬虫学习系列05 - 获取动态内容本节，将与网页进行交互，根据用户输入返回对应的内容。发送POST请求提交表单；使用cookie登陆网站；用于简化表单提交的高级模块Mechanize。...
复制链接

扫一扫