Python 爬虫（登录豆瓣并修改签名）

最新推荐文章于 2024-04-14 18:48:30 发布

trb331617

最新推荐文章于 2024-04-14 18:48:30 发布

阅读量1.1k

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/trb331617/article/details/72853853

版权

python 专栏收录该内容

19 篇文章 1 订阅

订阅专栏

代码：

#coding: utf-8

try:
    from html.parser import HTMLParser  # Python 3 location
except ImportError:
    from HTMLParser import HTMLParser   # Python 2 fallback

import requests
  5 
  6 
  7 class DoubanClient(object):
  8     def __init__(self):
  9         object.__init__(self)
 10 
 11         myheaders = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840
    .71 Safari/537.36','Origin': 'https://accounts.douban.com'}        #浏览器开发者：Request Headers
 12 
 13         self.session = requests.session()       #requests包 创建session
 14         self.session.headers.update(myheaders)  #将定制的header加入session
 15 
 16 
 17     def login(self, username, password,source='None',redir='https://www.douban.com/',login='登录'):
 18         #浏览器开发者：Form Data
 19 
 20         url = 'https://accounts.douban.com/login'       #网页URL
 21         r = self.session.get(url)       #用session访问该网页
 22         (captcha_id, captcha_url) = _get_captcha(r.content)     #调用get_captchar()解析网页中的内容，获取验证码的id和url
 23 
 24         #如果得到了验证码的id和url，提示用户打开url并输入其中的验证码
 25         if captcha_id:
 26             captcha_solution = raw_input('please input solution for captcha [%s]:' % captcha_url)
 27 
 28         url = 'https://accounts.douban.com/login'
 29         mydata = {'form_email': username,
 30                 'form_password': password,
 31                 'source': source,
 32                 'redir': redir,
 33                 'login': login}
 34         myheaders = {'referer': 'https://acocunts.douban.com/login',
 35                    'host': 'accounts.douban.com'}
 36         #浏览器开发者：Request Headers
 37 
 38         #将验证码的id和用户输入的验证码 加入post的data中
 39         if captcha_id:
 40             mydata['captcha-id'] = captcha_id
 41             mydata['captcha-solution'] = captcha_solution
 42 
 43         self.session.post(url, data=mydata, headers=myheaders)  #post发出请求
 44         print(self.session.cookies.items())
 45 
 46     #更改签名
 47     def edit_signature(self, username, signature):
 48         url = 'https://www.douban.com/people/%s/' % username    #网页URL
 49         r = self.session.get(url)       #用session访问该网页
 50         mydata = {'ck': _get_ck(r.content),
 51                 'signature': signature}
 52         myurl = 'https://www.douban.com/j/people/%s/edit_signature' % username
 53         myheaders = {'referer': url,
 54                    'host': 'www.douban.com',
 55                    'x-requested-with': 'XMLHttpRequest'}
 56         r = self.session.post(myurl, data=mydata, headers=myheaders)    #post
 57         print(r.content)
 58 
 59 
 60 def _attr(attrs, attrname):
 61     for attr in attrs:
 62         if attr[0] == attrname:
 63             return attr[1]
 64     return None
 65 
 66 
 67 def _get_captcha(content):
 68     #获取验证码的id和url        
 69     class CaptchaParser(HTMLParser):    #继承父类HTMLParser
 70         def __init__(self):
 71             HTMLParser.__init__(self)
 72             self.captcha_id = None      #默认值设为None
 73             self.captcha_url = None
 74 
 75         def handle_starttag(self, tag, attrs):
 76             if tag == 'img' and _attr(attrs, 'id') == 'captcha_image' and _attr(attrs, 'class') == 'captcha_image':
 77                 #根据网页框架进行条件限定，定位至验证码图片
 78                 self.captcha_url = _attr(attrs, 'src')  #得到验证码图片的url
 79 
 80             if tag == 'input' and _attr(attrs, 'type') == 'hidden' and _attr(attrs, 'name') == 'captcha-id':
 81                 #条件限定，定位至验证码id
 82                 self.captcha_id = _attr(attrs, 'value') #得到验证码的id value
 83 
 84     p = CaptchaParser()
 85     p.feed(content)     #feed()向解析器喂数据
 86     return p.captcha_id, p.captcha_url
 87 
 88 
 89 def _get_ck(content):
 90 
 91     class CKParser(HTMLParser):
 92         def __init__(self):
 93             HTMLParser.__init__(self)
 94             self.ck = None
 95 
 96         def handle_starttag(self, tag, attrs):
 97             if tag == 'input' and _attr(attrs, 'type') == 'hidden' and _attr(attrs, 'name') == 'ck':
 98                 #条件限定，定位至签名框
 99                 self.ck = _attr(attrs, 'value')
100 
101     p = CKParser()      #实例化类
102     p.feed(content)     #feed()向解析器喂数据
103     return p.ck
104 
105 
if __name__ == '__main__':
    # Demo run: log in, then replace the profile signature.
    # The password and signature strings below are placeholders.
    client = DoubanClient()
    client.login('791368726@qq.com', '**此处为密码**')
    client.edit_signature('162101126', '**此处为签名**')

执行：