Web Scraping

'''
from urllib.request  import urlopen
html = urlopen("http://pythonscraping.com/pages/page1.html")
print(html.read())'''

'''
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page1.html")
bsObj = BeautifulSoup(html.read())
print(bsObj.h1)'''

'''
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html)

nameList = bsObj.findAll("span", {"class":"green"})
for name in nameList:
    print(name.get_text())
'''


'''
findAll(tag, attributes, recursive, text, limit, keywords)
    tag        -- tag name(s) to match
    attributes -- tag attributes
    recursive  -- whether to search recursively; defaults to True (search every level), False searches only the first level
    text       -- match the given text
    limit      -- cap the number of results
    keywords   -- keyword-argument search

find(tag, attributes, recursive, text, keywords) is basically the same as findAll, except limit is fixed at 1, so only a single result is returned

Search by tag
.findAll({"h1","h2","h3","h4","h5","h6"})    find all heading tags

Tag + attributes
.findAll("span", {"class":{"green", "red"}}) find all span tags whose class is green or red

Search by text
nameList = bsObj.findAll(text="the prince")
print(len(nameList))

Keyword search
.findAll(id="text")    <===>   bsObj.findAll("", {"id":"text"}) 
Usually we use tag + attributes instead of keyword arguments


'''
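A minimal runnable sketch combining the lookups above (assuming the pythonscraping.com demo page is still reachable, and using the built-in html.parser so no extra install is needed):
'''
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html.read(), "html.parser")

headings = bsObj.findAll({"h1", "h2", "h3", "h4", "h5", "h6"})   # every heading tag
names = bsObj.findAll("span", {"class": {"green", "red"}})       # span tags whose class is green or red
princes = bsObj.findAll(text="the prince", limit=3)              # at most 3 text nodes equal to "the prince"
print(len(headings), len(names), len(princes))
'''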

'''
The four kinds of BeautifulSoup objects
1. BeautifulSoup objects
   e.g. bsObj in the code samples above
2. Tag objects
   a single tag or a list of tags obtained from a BeautifulSoup object via find / findAll,
   or by drilling into child tags directly, e.g. bsObj.div.h1
3. NavigableString objects
   represent the text inside a tag rather than the tag itself (some functions operate on and
   return NavigableString objects instead of Tag objects)
4. Comment objects
   used to find HTML comments, <!-- like this -->
'''
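A small self-contained sketch showing all four object types on an inline document (no network access needed):
'''
from bs4 import BeautifulSoup, Comment

soup = BeautifulSoup("<div><h1>Title</h1><!-- a comment --></div>", "html.parser")
print(type(soup))                  # <class 'bs4.BeautifulSoup'>
print(type(soup.div.h1))           # <class 'bs4.element.Tag'>
print(type(soup.h1.string))        # <class 'bs4.element.NavigableString'>
comment = soup.div.h1.next_sibling # the node right after <h1> is the comment
print(isinstance(comment, Comment))  # True
'''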

'''
Navigation tree of the mock online shopping site http://www.pythonscraping.com/pages/page3.html

• html
    — body
        — div.wrapper
            — h1
            — div.content
            — table#giftList
                — tr
                    — th
                    — th
                    — th
                    — th
                — tr.gift#gift1
                    — td
                    — td
                        — span.excitingNote
                    — td
                    — td
                        — img
            — ……其他表格行省略了……
        — div.footer

'''

from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html)

# 1. Handling children and other descendants
# for child in bsObj.find("table",{"id":"giftList"}).children:
#     print(child)
#  .children yields only the direct children; .descendants yields every descendant

# 2. Handling sibling tags
# BeautifulSoup's next_siblings makes collecting table data simple, especially for tables with a header row
# for sibling in bsObj.find("table",{"id":"giftList"}).tr.next_siblings:
#     print(sibling)
# tr.next_siblings yields the siblings that follow the tr, excluding the tr itself; likewise previous_siblings yields the preceding
# siblings. next_sibling and previous_sibling behave the same way except they return a single tag instead of a collection
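A tiny self-contained sketch of the singular vs. plural sibling accessors on an inline table (no network access needed):
'''
from bs4 import BeautifulSoup

table = BeautifulSoup(
    "<table><tr><th>name</th></tr><tr><td>Gift 1</td></tr><tr><td>Gift 2</td></tr></table>",
    "html.parser")

header_row = table.tr
print(header_row.next_sibling)                      # the single <tr> right after the header row
print([str(r) for r in header_row.next_siblings])   # every row after the header row
'''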

# 3. Handling parent tags
# print(bsObj.find("img",{"src":"../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())
'''
 <tr>
    — <td>
    — <td>
    — <td>(3)
        — "$15.00" (4)
    — <td>(2)
        — <img src="../img/gifts/img1.jpg"> (1)
(1) select the image tag with src="../img/gifts/img1.jpg";
(2) select that image tag's parent (in this example, a <td> tag);
(3) select that <td> tag's previous_sibling (in this example, the <td> that contains the dollar price);
(4) select the text inside that tag, "$15.00"
'''

# 4. Matching with regular expressions
import re
# images = bsObj.findAll("img",{"src":re.compile("\.\.\/img\/gifts/img.*\.jpg")})
# for image in images:
#     print(image["src"])
'''
This code prints the images' relative paths, which all start with ../img/gifts/img and end with .jpg; the result is as follows:
../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg

A regular expression can be passed as any argument in a BeautifulSoup call, which makes finding target elements very flexible. Here it matches the img src, and the pattern can be made as specific as needed.
'''

# 5. Getting tag attributes
'''
For example, the href attribute of an <a> tag or the src attribute of an <img> tag.
On a Tag object, .attrs returns every attribute as a dict, which can be read and modified. For instance, to get an image's source location src:
myImgTag.attrs["src"]
'''
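A short sketch of .attrs, reusing the bsObj parsed from page3 in the live code above:
'''
first_img = bsObj.find("img")
print(first_img.attrs)          # every attribute of the first <img> as a dict
print(first_img.attrs["src"])   # just the src value
'''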

# 6. Matching tags with a lambda (anonymous function)
'''
soup.findAll(lambda tag: len(tag.attrs) == 2)  finds tags with exactly two attributes, e.g.:
<div class="body" id="content"></div>
<span style="color:red" class="title"></span>
'''
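A self-contained sketch of the lambda filter:
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup('<div class="body" id="content"></div><span style="color:red">x</span>', "html.parser")
print(soup.findAll(lambda tag: len(tag.attrs) == 2))  # only the <div> has exactly two attributes
'''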

# 7. Other HTML parsers
# lxml    html.parser (the standard-library parser)
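A sketch showing how the parser is chosen when constructing BeautifulSoup (html.parser ships with Python; lxml must be installed separately, e.g. pip install lxml):
'''
from bs4 import BeautifulSoup
doc = "<html><body><h1>hi</h1></body></html>"
print(BeautifulSoup(doc, "html.parser").h1)   # standard-library parser
print(BeautifulSoup(doc, "lxml").h1)          # faster third-party parser
'''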

 

The above covers the basic usage of the BeautifulSoup module

    

Comparing single-process and concurrent crawling of a site

import requests
import re
import time
import hashlib

def get_page(url):
    print('GET %s' %url)
    try:
        response=requests.get(url)
        if response.status_code == 200:
            return response.content
    except Exception:
        pass

def parse_index(res):
    obj=re.compile('class="items.*?<a href="(.*?)"',re.S)
    detail_urls=obj.findall(res.decode('gbk'))
    for detail_url in detail_urls:
        if not detail_url.startswith('http'):
            detail_url='http://www.xiaohuar.com'+detail_url
        yield detail_url

def parse_detail(res):
    obj=re.compile('id="media".*?src="(.*?)"',re.S)
    res=obj.findall(res.decode('gbk'))
    if len(res) > 0:
        movie_url=res[0]
        return movie_url


def save(movie_url):
    response=requests.get(movie_url,stream=False)
    if response.status_code == 200:
        m=hashlib.md5()
        m.update(('%s%s.mp4' %(movie_url,time.time())).encode('utf-8'))
        filename=m.hexdigest()
        with open(r'./movies/%s.mp4' %filename,'wb') as f:
            f.write(response.content)
            f.flush()


def main():
    index_url='http://www.xiaohuar.com/list-3-{0}.html'
    for i in range(5):
        print('*'*50,i)
        #crawl the index page
        index_page=get_page(index_url.format(i,))
        #parse the index page to get the list of detail-page urls
        detail_urls=parse_index(index_page)
        #loop over the detail pages
        for detail_url in detail_urls:
            #fetch the detail page
            detail_page=get_page(detail_url)
            #extract the video url
            movie_url=parse_detail(detail_page)
            if movie_url:
                #save the video
                save(movie_url)


if __name__ == '__main__':
    main()

'''
#并发爬取
from concurrent.futures import ThreadPoolExecutor
import queue
import requests
import re
import time
import hashlib
from threading import current_thread

p=ThreadPoolExecutor(50)

def get_page(url):
    print('%s GET %s' %(current_thread().getName(),url))
    try:
        response=requests.get(url)
        if response.status_code == 200:
            return response.content
    except Exception as e:
        print(e)

def parse_index(res):
    print('%s parse index ' %current_thread().getName())
    res=res.result()
    obj=re.compile('class="items.*?<a href="(.*?)"',re.S)
    detail_urls=obj.findall(res.decode('gbk'))
    for detail_url in detail_urls:
        if not detail_url.startswith('http'):
            detail_url='http://www.xiaohuar.com'+detail_url
        p.submit(get_page,detail_url).add_done_callback(parse_detail)

def parse_detail(res):
    print('%s parse detail ' %current_thread().getName())
    res=res.result()
    obj=re.compile('id="media".*?src="(.*?)"',re.S)
    res=obj.findall(res.decode('gbk'))
    if len(res) > 0:
        movie_url=res[0]
        print('MOVIE_URL: ',movie_url)
        with open('db.txt','a') as f:
            f.write('%s\n' %movie_url)
        # save(movie_url)
        p.submit(save,movie_url)
        print('%s下载任务已经提交' %movie_url)
def save(movie_url):
    print('%s SAVE: %s' %(current_thread().getName(),movie_url))
    try:
        response=requests.get(movie_url,stream=False)
        if response.status_code == 200:
            m=hashlib.md5()
            m.update(('%s%s.mp4' %(movie_url,time.time())).encode('utf-8'))
            filename=m.hexdigest()
            with open(r'./movies/%s.mp4' %filename,'wb') as f:
                f.write(response.content)
                f.flush()
    except Exception as e:
        print(e)

def main():
    index_url='http://www.xiaohuar.com/list-3-{0}.html'
    for i in range(5):
        p.submit(get_page,index_url.format(i,)).add_done_callback(parse_index)


if __name__ == '__main__':
    main()
'''

The requests module

'''
import requests
response=requests.get('https://www.baidu.com/s?wd=python&pn=1',
                      # wd表示搜索内容  pn=1表示只访问第一页
                      headers={
                        # 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
                        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
                      })
# 得到的response就是用浏览器访问时network里的response
print(response.text)
'''
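The query string above can also be built by requests itself through the params argument; a small sketch (equivalent to writing the URL by hand):
'''
import requests
response = requests.get('https://www.baidu.com/s',
                        params={'wd': 'python', 'pn': 1},
                        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
print(response.url)   # the encoded url that was actually requested
'''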

import requests
import re
'''
#第一次请求
r1=requests.get('https://github.com/login')
# with open('r1.html','w',encoding='utf-8') as f:
#     f.write(r1.text)
r1_cookie=r1.cookies.get_dict() #拿到初始cookie(未被授权)
authenticity_token=re.findall(r'name="authenticity_token".*?value="(.*?)"',r1.text)[0] #从页面中拿到CSRF TOKEN
# print(r1_cookie,authenticity_token)
# {'logged_in': 'no', '_gh_sess': 'Q1ZiK1V5RjBmdTNoYVk0Y3FZRURIVmpMY1hXa3VGVTVFRmh1blN5QVltNGhmQnRFNkFvcmZpRzFST2ZxS2R1S0NZWUdEa0pZcFh2WGt3UE0xR3I4WmhLRUpnOW1aZzhoVVpDVU51UWZVbVJoZHo1UXBoZzVyY3Y2NE9pR1VqbGxZWVZxN3lZdEtqczA2M3BIaythOVlKVHRsdCtmV3hFT1B2WG0rRzRVcFY1Q2gyeGJVeVM5VGh3MllLVkdPdWxsVUpyL2dKeER1cWNxc1ppVjNTMGdwYUVoeE9UVjloa0FnYmIzdUhBbnNLNTZWQUx2WlR0VHYvM2J3VWdKZ2RrdjdHMklzODNFOTVNRTU0NTdMakEyMkV0UE5VY1VyOVBUeG14OUxidGdQOUZSQ2lITlVmVWxkSWpmOHc5Nm5sNWwtLXZBOVl0MmltcjMrQ2JVeGMzQjhVSlE9PQ%3D%3D--fc83b0b0c8be98b0d16b999f3b0dfe3386c47ad3', 'has_recent_activity': '1'} koXm+IFLTc/b5l+QNZBRL+R0sdFlvIY0bAMmof1TadJIUVCXIpmUtLTZQRHZKOFL0dPALof29ASylREm9vsqnw==


#第二次请求:带着初始cookie和TOKEN发送POST请求给登录页面,带上账号密码
data={
    'commit':'Sign in',
    'utf8':'✓',
    'authenticity_token':authenticity_token,
    'login':'317828332@qq.com',
    'password':'alex3714'
}
r2=requests.post('https://github.com/session',
             data=data,
             cookies=r1_cookie
             )


login_cookie=r2.cookies.get_dict()
print(login_cookie)
# {'logged_in': 'yes', '__Host-user_session_same_site': '-Twt5ZngjpZyILQLdG90CU-v7V_TAJEgKRXcWiAUBVeST-nO', '_gh_sess': 'eE9ZRFowWFVlTE9TUXhTTDE0U1BxbWU1WGw2M09vN0dzTGxiZkZhZXQ0Ui9xWEJMd0lJMlc0SHEzYldLMmJRZFpqbXZ1eXY0cWxaazdCeC9kUmhvS2JXdFBlY1h4Um1RZXJHYlVTWGxseldEc1J6OEord3lWUHhJVk5OQXRBMnFvTG13Q3BRd0ZlN3dFbFBTeGlYTXhwNS9FaGlwMldYYnMyQldhWVhwQk5xeTJWZG9URnI5QWF0OGliUzZ3bXRHTFVrWjR0cWZwWEtFQXJTeGtpT1dqbzR0Q0dIV3RDUjlwdmpmZG9zWHlDdz0tLTlKUHFCM0UwNWtNU1A4ZmNhS1hjZHc9PQ%3D%3D--28dc2b8141a93588ab52d166cc13c188c278f4d5', 'has_recent_activity': '1', 'user_session': '-Twt5ZngjpZyILQLdG90CU-v7V_TAJEgKRXcWiAUBVeST-nO'}
# 总结下,首先使用get请求的到初始cookie(未被授权)和CSRF TOKEN,然后使用post发送带着初始cookie和TOKEN发送POST请求给登录页面,带上账号密码

#第三次请求:以后的登录,拿着login_cookie就可以,比如访问一些个人配置
login_cookie = {'logged_in': 'yes', '__Host-user_session_same_site': 'Q3_oQ9DLwIZ5Z4vCrlrEtw4PSd8E8WLN7ftycrLINDmyLU3Y', '_gh_sess': 'UE9ZTVdFRHpCS2sreEpVRFNKUTR1NnE2VGVRMkU2bzZUa3p5YkpzQTFaS0wxTHBZSlFsaUpyeVlhdWNoZy9qTGkzaW9DNGNvbzMzdStnY3JzQXFlUElpT3RVSkk4RFAza2Zac3ZWWTdDL0hCY0p4aU9RbnBNMERqMXFUNEk1OUNUeVhCZTljTS9IeUxsK3FISEhrZjJHL0kvVVhjV2hkVzBPd292SmUxVEx1L2hRWHlGSDVPUTdXU2ZhNzRRSzBHdUpkREdOVjlSRy94Q0FBVUFjY05wOGQ5MEdUMkEyOHQ1RDNRUnRCeDNFYz0tLU5FR013M3IvY3d2UTlRbDFoamk2VGc9PQ%3D%3D--4b83733a566892557a9ba486333a49c1b72d13f3', 'has_recent_activity': '1', 'user_session': 'Q3_oQ9DLwIZ5Z4vCrlrEtw4PSd8E8WLN7ftycrLINDmyLU3Y'}

r3=requests.get('https://github.com/settings/emails',
                cookies=login_cookie)
with open('r3.html','w',encoding='utf-8') as f:
    f.write(r3.text)
print('317828332@qq.com' in r3.text) #True
'''

import requests
respone=requests.get('http://www.baidu.com')
# response attributes
print('respone.text',respone.text)
print('*'*50)
print('content',respone.content)
print('*'*50)
print('status_code',respone.status_code)
print('*'*50)
print('headers',respone.headers)
print('*'*50)
print('cookies',respone.cookies)
print('*'*50)
print('get_dict',respone.cookies.get_dict())
print('*'*50)
print('items',respone.cookies.items())
print('*'*50)
print('url',respone.url)
print('*'*50)
print('history',respone.history)
print('*'*50)
print('encoding',respone.encoding)
'''
respone.text <!DOCTYPE html>
<!--STATUS OK--><html> <head><meta http-equiv=content-type content=text/html;charset=utf-8><meta http-equiv=X-UA-Compatible content=IE=Edge><meta content=always name=referrer><link rel=stylesheet type=text/css href=http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css><title>ç¾åº¦ä¸ä¸ï¼ä½ å°±ç¥é</title></head> <body link=#0000cc> <div id=wrapper> <div id=head> <div class=head_wrapper> <div class=s_form> <div class=s_form_wrapper> <div id=lg> <img hidefocus=true src=//www.baidu.com/img/bd_logo1.png width=270 height=129> </div> <form id=form name=f action=//www.baidu.com/s class=fm> <input type=hidden name=bdorz_come value=1> <input type=hidden name=ie value=utf-8> <input type=hidden name=f value=8> <input type=hidden name=rsv_bp value=1> <input type=hidden name=rsv_idx value=1> <input type=hidden name=tn value=baidu><span class="bg s_ipt_wr"><input id=kw name=wd class=s_ipt value maxlength=255 autocomplete=off autofocus></span><span class="bg s_btn_wr"><input type=submit id=su value=ç¾åº¦ä¸ä¸ class="bg s_btn"></span> </form> </div> </div> <div id=u1> <a href=http://news.baidu.com name=tj_trnews class=mnav>æ°é»</a> <a href=http://www.hao123.com name=tj_trhao123 class=mnav>hao123</a> <a href=http://map.baidu.com name=tj_trmap class=mnav>å°å¾</a> <a href=http://v.baidu.com name=tj_trvideo class=mnav>è§é¢</a> <a href=http://tieba.baidu.com name=tj_trtieba class=mnav>è´´å§</a> <noscript> <a href=http://www.baidu.com/bdorz/login.gif?login&amp;tpl=mn&amp;u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1 name=tj_login class=lb>ç»å½</a> </noscript> <script>document.write('<a href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u='+ encodeURIComponent(window.location.href+ (window.location.search === "" ? "?" : "&")+ "bdorz_come=1")+ '" name="tj_login" class="lb">ç»å½</a>');</script> <a href=//www.baidu.com/more/ name=tj_briicon class=bri style="display: block;">æ´å¤äº§å</a> </div> </div> </div> <div id=ftCon> <div id=ftConw> <p id=lh> <a href=http://home.baidu.com>å³äºç¾åº¦</a> <a href=http://ir.baidu.com>About Baidu</a> </p> <p id=cp>&copy;2017&nbsp;Baidu&nbsp;<a href=http://www.baidu.com/duty/>使ç¨ç¾åº¦åå¿è¯»</a>&nbsp; <a href=http://jianyi.baidu.com/ class=cp-feedback>æè§åé¦</a>&nbsp;京ICPè¯030173å·&nbsp; <img src=//www.baidu.com/img/gs.gif> </p> </div> </div> </div> </body> </html>

**************************************************
content b'<!DOCTYPE html>\r\n<!--STATUS OK--><html> <head><meta http-equiv=content-type content=text/html;charset=utf-8><meta http-equiv=X-UA-Compatible content=IE=Edge><meta content=always name=referrer><link rel=stylesheet type=text/css href=http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css><title>\xe7\x99\xbe\xe5\xba\xa6\xe4\xb8\x80\xe4\xb8\x8b\xef\xbc\x8c\xe4\xbd\xa0\xe5\xb0\xb1\xe7\x9f\xa5\xe9\x81\x93</title></head> <body link=#0000cc> <div id=wrapper> <div id=head> <div class=head_wrapper> <div class=s_form> <div class=s_form_wrapper> <div id=lg> <img hidefocus=true src=//www.baidu.com/img/bd_logo1.png width=270 height=129> </div> <form id=form name=f action=//www.baidu.com/s class=fm> <input type=hidden name=bdorz_come value=1> <input type=hidden name=ie value=utf-8> <input type=hidden name=f value=8> <input type=hidden name=rsv_bp value=1> <input type=hidden name=rsv_idx value=1> <input type=hidden name=tn value=baidu><span class="bg s_ipt_wr"><input id=kw name=wd class=s_ipt value maxlength=255 autocomplete=off autofocus></span><span class="bg s_btn_wr"><input type=submit id=su value=\xe7\x99\xbe\xe5\xba\xa6\xe4\xb8\x80\xe4\xb8\x8b class="bg s_btn"></span> </form> </div> </div> <div id=u1> <a href=http://news.baidu.com name=tj_trnews class=mnav>\xe6\x96\xb0\xe9\x97\xbb</a> <a href=http://www.hao123.com name=tj_trhao123 class=mnav>hao123</a> <a href=http://map.baidu.com name=tj_trmap class=mnav>\xe5\x9c\xb0\xe5\x9b\xbe</a> <a href=http://v.baidu.com name=tj_trvideo class=mnav>\xe8\xa7\x86\xe9\xa2\x91</a> <a href=http://tieba.baidu.com name=tj_trtieba class=mnav>\xe8\xb4\xb4\xe5\x90\xa7</a> <noscript> <a href=http://www.baidu.com/bdorz/login.gif?login&amp;tpl=mn&amp;u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1 name=tj_login class=lb>\xe7\x99\xbb\xe5\xbd\x95</a> </noscript> <script>document.write(\'<a href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=\'+ encodeURIComponent(window.location.href+ (window.location.search === "" ? "?" : "&")+ "bdorz_come=1")+ \'" name="tj_login" class="lb">\xe7\x99\xbb\xe5\xbd\x95</a>\');</script> <a href=//www.baidu.com/more/ name=tj_briicon class=bri style="display: block;">\xe6\x9b\xb4\xe5\xa4\x9a\xe4\xba\xa7\xe5\x93\x81</a> </div> </div> </div> <div id=ftCon> <div id=ftConw> <p id=lh> <a href=http://home.baidu.com>\xe5\x85\xb3\xe4\xba\x8e\xe7\x99\xbe\xe5\xba\xa6</a> <a href=http://ir.baidu.com>About Baidu</a> </p> <p id=cp>&copy;2017&nbsp;Baidu&nbsp;<a href=http://www.baidu.com/duty/>\xe4\xbd\xbf\xe7\x94\xa8\xe7\x99\xbe\xe5\xba\xa6\xe5\x89\x8d\xe5\xbf\x85\xe8\xaf\xbb</a>&nbsp; <a href=http://jianyi.baidu.com/ class=cp-feedback>\xe6\x84\x8f\xe8\xa7\x81\xe5\x8f\x8d\xe9\xa6\x88</a>&nbsp;\xe4\xba\xacICP\xe8\xaf\x81030173\xe5\x8f\xb7&nbsp; <img src=//www.baidu.com/img/gs.gif> </p> </div> </div> </div> </body> </html>\r\n'
**************************************************
status_code 200
**************************************************
headers {'Cache-Control': 'private, no-cache, no-store, proxy-revalidate, no-transform', 'Connection': 'Keep-Alive', 'Content-Encoding': 'gzip', 'Content-Type': 'text/html', 'Date': 'Sun, 26 Aug 2018 09:02:09 GMT', 'Last-Modified': 'Mon, 23 Jan 2017 13:27:36 GMT', 'Pragma': 'no-cache', 'Server': 'bfe/1.0.8.18', 'Set-Cookie': 'BDORZ=27315; max-age=86400; domain=.baidu.com; path=/', 'Transfer-Encoding': 'chunked'}
**************************************************
cookies <RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>
**************************************************
get_dict {'BDORZ': '27315'}
**************************************************
items [('BDORZ', '27315')]
**************************************************
url http://www.baidu.com/
**************************************************
history []
**************************************************
encoding ISO-8859-1

'''
# closing the connection: response.close()
# from contextlib import closing
# with closing(requests.get('xxx',stream=True)) as response:
#     for line in response.iter_content():
#         pass
'''
编码问题
import requests
response=requests.get('http://www.autohome.com/news')
response.encoding='gbk' #汽车之家网站返回的页面内容为gb2312编码的,而requests的默认编码为ISO-8859-1,如果不设置成gbk则中文乱码
print(response.text)
------------------------------------------------------------------------------------------------------
import requests

response=requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1509868306530&di=712e4ef3ab258b36e9f4b48e85a81c9d&imgtype=0&src=http%3A%2F%2Fc.hiphotos.baidu.com%2Fimage%2Fpic%2Fitem%2F11385343fbf2b211e1fb58a1c08065380dd78e0c.jpg')

with open('a.jpg','wb') as f:
    f.write(response.content)
-----------------------------------------------------------------------------------------------------
#stream参数:一点一点的取,比如下载视频时,如果视频100G,用response.content然后一下子写到文件中是不合理的

import requests

response=requests.get('https://gss3.baidu.com/6LZ0ej3k1Qd3ote6lo7D0j9wehsv/tieba-smallvideo-transcode/1767502_56ec685f9c7ec542eeaf6eac93a65dc7_6fe25cd1347c_3.mp4',
                      stream=True)

with open('b.mp4','wb') as f:
    for line in response.iter_content():
        f.write(line)

# 获取二进制流
-----------------------------------------------------------------------------------------------------
#解析json
import requests
response=requests.get('http://httpbin.org/get')

import json
res1=json.loads(response.text) #太麻烦

res2=response.json() #直接获取json数据


print(res1 == res2) #True

-----------------------------------------------------------------------------------------------
Redirection and History
import requests
import re

#第一次请求
r1=requests.get('https://github.com/login')
r1_cookie=r1.cookies.get_dict() #拿到初始cookie(未被授权)
authenticity_token=re.findall(r'name="authenticity_token".*?value="(.*?)"',r1.text)[0] #从页面中拿到CSRF TOKEN

#第二次请求:带着初始cookie和TOKEN发送POST请求给登录页面,带上账号密码
data={
    'commit':'Sign in',
    'utf8':'✓',
    'authenticity_token':authenticity_token,
    'login':'317828332@qq.com',
    'password':'alex3714'
}


#测试一:没有指定allow_redirects=False,则响应头中出现Location就跳转到新页面,r2代表新页面的response
r2=requests.post('https://github.com/session',
             data=data,
             cookies=r1_cookie
             )

print(r2.status_code) #200
print(r2.url) #看到的是跳转后的页面
print(r2.history) #看到的是跳转前的response
print(r2.history[0].text) #看到的是跳转前的response.text


#测试二:指定allow_redirects=False,则响应头中即便出现Location也不会跳转到新页面,r2代表的仍然是老页面的response
r2=requests.post('https://github.com/session',
             data=data,
             cookies=r1_cookie,
             allow_redirects=False
             )


print(r2.status_code) #302
print(r2.url) #看到的是跳转前的页面https://github.com/session
print(r2.history) #[]

利用github登录后跳转到主页面的例子来验证它


-------------------------------------------------------------------------------------------------------------

SSL Cert Verification

#证书验证(大部分网站都是https)
import requests
respone=requests.get('https://www.12306.cn') #如果是ssl请求,首先检查证书是否合法,不合法则报错,程序终端


#改进1:去掉报错,但是会报警告
import requests
respone=requests.get('https://www.12306.cn',verify=False) #不验证证书,报警告,返回200
print(respone.status_code)


#改进2:去掉报错,并且去掉警报信息
import requests
from requests.packages import urllib3
urllib3.disable_warnings() #关闭警告
respone=requests.get('https://www.12306.cn',verify=False)
print(respone.status_code)

#改进3:加上证书
#很多网站都是https,但是不用证书也可以访问,大多数情况都是可以携带也可以不携带证书
#知乎\百度等都是可带可不带
#有硬性要求的,则必须带,比如对于定向的用户,拿到证书后才有权限访问某个特定网站
import requests
respone=requests.get('https://www.12306.cn',
                     cert=('/path/server.crt',
                           '/path/key'))
print(respone.status_code)

----------------------------------------------------------------------------------------------------
代理设置
#官网链接: http://docs.python-requests.org/en/master/user/advanced/#proxies

#代理设置:先发送请求给代理,然后由代理帮忙发送(封ip是常见的事情)
import requests
proxies={
    # 'http':'http://egon:123@localhost:9743', # proxy that needs auth: username and password go before the @
    'http':'http://localhost:9743',
    'https':'https://localhost:9743',
}
respone=requests.get('https://www.12306.cn',
                     proxies=proxies)

print(respone.status_code)



#支持socks代理,安装:pip install requests[socks]
import requests
proxies = {
    'http': 'socks5://user:pass@host:port',
    'https': 'socks5://user:pass@host:port'
}
respone=requests.get('https://www.12306.cn',
                     proxies=proxies)

print(respone.status_code)
-------------------------------------------------------------------------------------------
超时设置

#两种超时:float or tuple
#timeout=0.1 #代表接收数据的超时时间
#timeout=(0.1,0.2)#0.1代表链接超时  0.2代表接收数据的超时时间

import requests
respone=requests.get('https://www.baidu.com',
                     timeout=0.0001)
#                      测试两秒就不超时了

-------------------------------------------------------------------------------------------------
认证设置
#官网链接:http://docs.python-requests.org/en/master/user/authentication/

#认证设置:登陆网站是,弹出一个框,要求你输入用户名密码(与alter很类似),此时是无法获取html的
# 但本质原理是拼接成请求头发送
#         r.headers['Authorization'] = _basic_auth_str(self.username, self.password)
# 一般的网站都不用默认的加密方式,都是自己写
# 那么我们就需要按照网站的加密方式,自己写一个类似于_basic_auth_str的方法
# 得到加密字符串后添加到请求头
#         r.headers['Authorization'] =func('.....')

#看一看默认的加密方式吧,通常网站都不会用默认的加密设置
import requests
from requests.auth import HTTPBasicAuth
r=requests.get('xxx',auth=HTTPBasicAuth('user','password'))
print(r.status_code)

#HTTPBasicAuth可以简写为如下格式
import requests
r=requests.get('xxx',auth=('user','password'))
print(r.status_code)
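# A hedged sketch of the custom-auth idea described above: build the site's own auth string and put it into the
# request headers yourself (build_auth_str is a hypothetical placeholder mimicking _basic_auth_str, not a real requests API):
import requests
import base64

def build_auth_str(user, password):
    # placeholder: basic-auth style encoding; a real site would define its own scheme
    token = base64.b64encode(('%s:%s' % (user, password)).encode('utf-8')).decode('ascii')
    return 'Basic ' + token

r = requests.get('xxx', headers={'Authorization': build_auth_str('user', 'password')})
print(r.status_code)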
--------------------------------------------------------------------------------------
#异常处理
import requests
from requests.exceptions import * #可以查看requests.exceptions获取异常类型

try:
    r=requests.get('http://www.baidu.com',timeout=0.00001)
except ReadTimeout:
    print('===:')
# except ConnectionError: #网络不通
#     print('-----')
# except Timeout:
#     print('aaaaa')

except RequestException:
    print('Error')
------------------------------------------------------------------------------------------
上传文件
import requests
files={'file':open('a.jpg','rb')}
respone=requests.post('http://httpbin.org/post',files=files)
print(respone.status_code)
'''

 

 

selenium: driving a browser
Selenium started out as an automated-testing tool. In scraping it is mainly used because requests cannot execute JavaScript.

Selenium works by driving a real browser and fully simulating its operations (navigating, typing, clicking, scrolling and so on) to obtain the page after it has been rendered. Multiple browsers are supported.
# """
# #安装:selenium+chromedriver  有界面浏览器
# pip3 install selenium
# 下载chromdriver.exe放到python安装路径的scripts目录中即可,注意最新版本是2.38,并非2.9
# 国内镜像网站地址:http://npm.taobao.org/mirrors/chromedriver/2.38/
# 最新的版本去官网找:https://sites.google.com/a/chromium.org/chromedriver/downloads
#
# #验证安装
# C:\Users\Administrator>python3
# Python 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)] on win32
# Type "help", "copyright", "credits" or "license" for more information.
# >>> from selenium import webdriver
# >>> driver=webdriver.Chrome() #弹出浏览器
# >>> driver.get('https://www.baidu.com')
# >>> driver.page_source
#
# #注意:
# selenium3默认支持的webdriver是Firfox,而Firefox需要安装geckodriver
# 下载链接:https://github.com/mozilla/geckodriver/releases
#
#
#
#
#
#
# # selenium+谷歌浏览器headless模式  无界面浏览器
#
# #selenium:3.12.0
# #webdriver:2.38
# #chrome.exe: 65.0.3325.181(正式版本) (32 位)
#
# from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
# chrome_options = Options()
# chrome_options.add_argument('window-size=1920x3000') #指定浏览器分辨率
# chrome_options.add_argument('--disable-gpu') #谷歌文档提到需要加上这个属性来规避bug
# chrome_options.add_argument('--hide-scrollbars') #隐藏滚动条, 应对一些特殊页面
# chrome_options.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度
# chrome_options.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
# chrome_options.binary_location = r"C:\Users\afly\AppData\Local\Google\Chrome\Application\chrome.exe" #手动指定使用的浏览器位置
#
#
# driver=webdriver.Chrome(chrome_options=chrome_options)
# driver.get('https://www.baidu.com')
# # print(driver.page_source)
#
# print('hao123' in driver.page_source)
#
#
# driver.close() #切记关闭浏览器,回收资源
# """
'''
基本使用
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素

browser=webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')

    input_tag=browser.find_element_by_id('kw')
    input_tag.send_keys('美女') #python2中输入中文错误,字符串前加个u
    input_tag.send_keys(Keys.ENTER) #输入回车

    wait=WebDriverWait(browser,10)
    wait.until(EC.presence_of_element_located((By.ID,'content_left'))) #等到id为content_left的元素加载完毕,最多等10秒

    print(browser.page_source)
    print(browser.current_url)
    print(browser.get_cookies())

finally:
    browser.close()
'''
"""
选择器 基本用法  下面的例子是登录百度账号
#官网链接:http://selenium-python.readthedocs.io/locating-elements.html
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
import time

driver=webdriver.Chrome()
driver.get('https://www.baidu.com')
wait=WebDriverWait(driver,10)

try:
    #===============所有方法===================
    # 1、find_element_by_id
    # 2、find_element_by_link_text
    # 3、find_element_by_partial_link_text
    # 4、find_element_by_tag_name
    # 5、find_element_by_class_name
    # 6、find_element_by_name
    # 7、find_element_by_css_selector
    # 8、find_element_by_xpath
    # 强调:
    # 1、上述均可以改写成find_element(By.ID,'kw')的形式
    # 2、find_elements_by_xxx的形式是查找到多个元素,结果为列表

    #===============示范用法===================
    # 1、find_element_by_id
    print('driver.find_element_by_id(kw)',driver.find_element_by_id('kw'))
    # driver.find_element_by_id(kw)   <selenium.webdriver.remote.webelement.WebElement (session="9c700746c88efecdf2433feea3ccd4b3", element="0.8758959721944797-1")>

    # 2、find_element_by_link_text
    # login=driver.find_element_by_link_text('登录')
    # login.click()

    # 3、find_element_by_partial_link_text
    login=driver.find_elements_by_partial_link_text('录')[0]
    login.click()

    # 4、find_element_by_tag_name
    print('driver.find_element_by_tag_name(a)',driver.find_element_by_tag_name('a'))
    # driver.find_element_by_tag_name(a)  <selenium.webdriver.remote.webelement.WebElement (session="9c700746c88efecdf2433feea3ccd4b3", element="0.8758959721944797-3")>

    # 5、find_element_by_class_name
    button=wait.until(EC.element_to_be_clickable((By.CLASS_NAME,'tang-pass-footerBarULogin')))  # tang-pass-footerBarULogin这个是用户名登录这个p标签的一个class
    button.click()

    # 6、find_element_by_name
    input_user=wait.until(EC.presence_of_element_located((By.NAME,'userName'))) # 这个是input输入框,input的属性name='userName'
    input_pwd=wait.until(EC.presence_of_element_located((By.NAME,'password')))  # 密码 input输入框 name
    commit=wait.until(EC.element_to_be_clickable((By.ID,'TANGRAM__PSP_10__submit'))) # 登录按钮id

    input_user.send_keys('18611453110')
    input_pwd.send_keys('xxxxxx')
    commit.click()

    # 7、find_element_by_css_selector
    driver.find_element_by_css_selector('#kw')

    # 8、find_element_by_xpath

    time.sleep(5)

finally:
    driver.close()
"""
'''
二. xpath
# 官网链接:http://selenium-python.readthedocs.io/locating-elements.html
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # 按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys  # 键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
import time

driver = webdriver.PhantomJS()  # 这里使用的是PhantomJS,我还没安装
driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')
# wait=WebDriverWait(driver,3)
driver.implicitly_wait(3)  # 使用隐式等待

try:
    # find_element_by_xpath
    # //与/
    # driver.find_element_by_xpath('//body/a')  # 开头的//代表从整篇文档中寻找,body之后的/代表body的儿子,这一行找不到就会报错了

    ret1 = driver.find_element_by_xpath('//body//a')  # 开头的//代表从整篇文档中寻找,body之后的//代表body的子子孙孙
    ret2 = driver.find_element_by_css_selector('body a')
    print(ret1)
    print('*'*50)
    print(ret2)

    # 取第n个
    res1 = driver.find_elements_by_xpath('//body//a[1]')  # 取第一个a标签
    print(res1[0].text)

    # 按照属性查找,下述三者查找效果一样
    res1 = driver.find_element_by_xpath('//a[5]')
    res2 = driver.find_element_by_xpath('//a[@href="image5.html"]')
    res3 = driver.find_element_by_xpath('//a[contains(@href,"image5")]')  # 模糊查找
    print('==>', res1.text)
    print('==>', res2.text)
    print('==>', res3.text)

    # 其他
    res1 = driver.find_element_by_xpath('/html/body/div/a')
    print(res1.text)

    res2 = driver.find_element_by_xpath('//a[img/@src="image3_thumb.jpg"]')  # 找到子标签img的src属性为image3_thumb.jpg的a标签
    print(res2.tag_name, res2.text)

    res3 = driver.find_element_by_xpath("//input[@name='continue'][@type='button']")  # 查看属性name为continue且属性type为button的input标签
    res4 = driver.find_element_by_xpath("//*[@name='continue'][@type='button']")  # 查看属性name为continue且属性type为button的所有标签

    time.sleep(5)

finally:
    driver.close()
'''
"""
# 获取标签属性
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素

browser=webdriver.Chrome()

browser.get('https://www.amazon.cn/')

wait=WebDriverWait(browser,10)
wait.until(EC.presence_of_element_located((By.ID,'cc-lm-tcgShowImgContainer')))

tag=browser.find_element(By.CSS_SELECTOR,'#cc-lm-tcgShowImgContainer img')

#获取标签属性,
print(tag.get_attribute('src'))


#获取标签ID,位置,名称,大小(了解)
print(tag.id)
print(tag.location)
print(tag.tag_name)
print(tag.size)


browser.close()
"""
"""
# 等待元素被加载
# #1、selenium只是模拟浏览器的行为,而浏览器解析页面是需要时间的(执行css,js),一些元素可能需要过一段时间才能加载出来,为了保证能查找到元素,必须等待
# #2、等待的方式分两种:
# 隐式等待:在browser.get('xxx')前就设置,针对所有元素有效
# 显式等待:在browser.get('xxx')之后设置,只针对某个元素有效


例子:
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素

browser=webdriver.Chrome()

#隐式等待:在查找所有元素时,如果尚未被加载,则等10秒
browser.implicitly_wait(10)

browser.get('https://www.baidu.com')


input_tag=browser.find_element_by_id('kw')
input_tag.send_keys('美女')
input_tag.send_keys(Keys.ENTER)

contents=browser.find_element_by_id('content_left') #没有等待环节而直接查找,找不到则会报错
print(contents)
browser.close()


from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素

browser=webdriver.Chrome()
browser.get('https://www.baidu.com')


input_tag=browser.find_element_by_id('kw')
input_tag.send_keys('美女')
input_tag.send_keys(Keys.ENTER)


#显式等待:显式地等待某个元素被加载
wait=WebDriverWait(browser,10)
wait.until(EC.presence_of_element_located((By.ID,'content_left')))

contents=browser.find_element(By.CSS_SELECTOR,'#content_left')
print(contents)
browser.close()
"""
"""
六 .元素交互操作
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素

browser=webdriver.Chrome()
browser.get('https://www.amazon.cn/')
wait=WebDriverWait(browser,10)


input_tag=wait.until(EC.presence_of_element_located((By.ID,'twotabsearchtextbox')))
input_tag.send_keys('iphone 8')
button=browser.find_element_by_css_selector('#nav-search > form > div.nav-right > div > input')
button.click()


import time
time.sleep(3)

input_tag=browser.find_element_by_id('twotabsearchtextbox')
input_tag.clear() #清空输入框
input_tag.send_keys('iphone7plus')
button=browser.find_element_by_css_selector('#nav-search > form > div.nav-right > div > input')
button.click()

browser.close()
"""

"""
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # 按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys  # 键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
import time

driver = webdriver.Chrome()
driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
wait=WebDriverWait(driver,3)
# driver.implicitly_wait(3)  # 使用隐式等待

try:
    driver.switch_to.frame('iframeResult') ##切换到iframeResult
    sourse=driver.find_element_by_id('draggable')
    target=driver.find_element_by_id('droppable')

    #方式一:基于同一个动作链串行执行
    actions=ActionChains(driver) #拿到动作链对象
    actions.drag_and_drop(sourse,target) #把动作放到动作链中,准备串行执行
    actions.perform()

    #方式二:不同的动作链,每次移动的位移都不同
    '''
    ActionChains(driver).click_and_hold(sourse).perform()
    distance=target.location['x']-sourse.location['x']

    track=0
    while track < distance:
        ActionChains(driver).move_by_offset(xoffset=2,yoffset=0).perform()
        track+=2
        time.sleep(0.5)

    ActionChains(driver).release().perform()
'''
    time.sleep(10)


finally:
    driver.close()

"""

"""
# 在交互动作比较难实现的时候可以自己写JS(万能方法)
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素


try:
    browser=webdriver.Chrome()
    browser.get('https://www.baidu.com')
    browser.execute_script('alert("hello world")') #打印警告  execute_script()这个函数可以执行javascript命令
finally:
    pass
    # browser.close()
"""
"""
# 补充:frame的切换
#frame相当于一个单独的网页,在父frame里是无法直接查看到子frame的元素的,必须switch_to_frame切到该frame下,才能进一步查找
# 在本例中访问的网页使用了frame

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素


try:
    browser=webdriver.Chrome()
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    browser.switch_to.frame('iframeResult') #切换到id为iframeResult的frame

    tag1=browser.find_element_by_id('droppable')
    print(tag1)

    # tag2=browser.find_element_by_id('textareaCode') #报错,在子frame里无法查看到父frame的元素
    browser.switch_to.parent_frame() #切回父frame,就可以查找到了
    tag2=browser.find_element_by_id('textareaCode')  # 这是那个代码的输入框
    print(tag2)

finally:
    browser.close()
"""

 

 

"""
#模拟浏览器的前进后退
import time
from selenium import webdriver

browser=webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.get('https://www.taobao.com')
browser.get('http://www.sina.com.cn/')

browser.back()
time.sleep(10)
browser.forward()
browser.close()
"""


"""
#cookies
from selenium import webdriver

browser=webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
print(browser.get_cookies())
browser.add_cookie({'k1':'xxx','k2':'yyy'})
print(browser.get_cookies())

# browser.delete_all_cookies()
"""


"""
#选项卡管理(多个网页窗口的管理):切换选项卡,有js的方式windows.open,有windows快捷键:ctrl+t等,最通用的就是js的方式
import time
from selenium import webdriver

browser=webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')

print(browser.window_handles) #获取所有的选项卡
# switch_to_window still runs, but PyCharm strikes it through as deprecated; the recommended replacement is switch_to.window
# see https://blog.csdn.net/ccggaag/article/details/76652274
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(10)
browser.switch_to.window(browser.window_handles[0])
browser.get('https://www.sina.com.cn')
browser.close()
"""


"""
# 异常处理
from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException

try:
    browser=webdriver.Chrome()
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    browser.switch_to.frame('iframssseResult')

except TimeoutException as e:
    print(e)
except NoSuchFrameException as e:
    print(e)
finally:
    browser.close()
"""

"""
# 自动登录163邮箱并发送邮件
#注意:网站都策略都是在不断变化的,精髓在于学习流程。下述代码生效与2017-11-7,不能保证永久有效
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

browser=webdriver.Chrome()

try:
    browser.get('http://mail.163.com/')

    wait=WebDriverWait(browser,5)

    frame=wait.until(EC.presence_of_element_located((By.ID,'x-URS-iframe')))
    browser.switch_to.frame(frame)

    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'.m-container')))

    inp_user=browser.find_element_by_name('email')
    inp_pwd=browser.find_element_by_name('password')
    button=browser.find_element_by_id('dologin')
    inp_user.send_keys('18611453110')
    inp_pwd.send_keys('xxxx')
    button.click()

    #如果遇到验证码,可以把下面一小段打开注释
    # import time
    # time.sleep(10)
    # button = browser.find_element_by_id('dologin')
    # button.click()

    wait.until(EC.presence_of_element_located((By.ID,'dvNavTop')))
    write_msg=browser.find_elements_by_css_selector('#dvNavTop li')[1] #获取第二个li标签就是“写信”了
    write_msg.click()


    wait.until(EC.presence_of_element_located((By.CLASS_NAME,'tH0')))
    recv_man=browser.find_element_by_class_name('nui-editableAddr-ipt')
    title=browser.find_element_by_css_selector('.dG0 .nui-ipt-input')
    recv_man.send_keys('378533872@qq.com')
    title.send_keys('圣旨')
    print(title.tag_name)


    frame=wait.until(EC.presence_of_element_located((By.CLASS_NAME,'APP-editor-iframe')))
    browser.switch_to.frame(frame)
    body=browser.find_element(By.CSS_SELECTOR,'body')
    body.send_keys('egon很帅,可以加工资了')

    browser.switch_to.parent_frame() #切回他爹
    send_button=browser.find_element_by_class_name('nui-toolbar-item')
    send_button.click()

    #可以睡时间久一点别让浏览器关掉,看看发送成功没有
    import time
    time.sleep(10000)

except Exception as e:
    print(e)
finally:
    browser.close()
"""

"""
# 爬取京东商城商品信息
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
import time


def get_goods(driver):
    try:
        goods=driver.find_elements_by_class_name('gl-item')

        for good in goods:
            detail_url=good.find_element_by_tag_name('a').get_attribute('href')

            p_name=good.find_element_by_css_selector('.p-name em').text.replace('\n','')
            price=good.find_element_by_css_selector('.p-price i').text
            p_commit=good.find_element_by_css_selector('.p-commit a').text

            msg = '''
            商品 : %s
            链接 : %s
            价钱 :%s
            评论 :%s
            ''' % (p_name,detail_url,price,p_commit)

            print(msg,end='\n\n')


        button=driver.find_element_by_partial_link_text('下一页')
        button.click()
        time.sleep(1)
        get_goods(driver)
    except Exception:
        pass

def spider(url,keyword):
    driver = webdriver.Chrome()
    driver.get(url)
    driver.implicitly_wait(3)  # 使用隐式等待
    try:
        input_tag=driver.find_element_by_id('key')
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        driver.close()


if __name__ == '__main__':
    spider('https://www.jd.com/',keyword='iPhone8手机')
"""

BeautifulSoup: basic usage

# HTML text
# html_doc = """
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <p class="title"><b>The Dormouse's story</b></p>
#
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
#
# <p class="story">...</p>
# """
#
# # Basic usage. Fault tolerance: even when the HTML code is incomplete, the module can recognise the problem.
# # Parsing the code above with BeautifulSoup yields a BeautifulSoup object whose structure can be output with standard indentation.
# from bs4 import BeautifulSoup
# soup=BeautifulSoup(html_doc,'lxml') # fault tolerant
# res=soup.prettify() # fix the indentation, display the structure
# print(res)

# Traversing the document tree: select directly by tag name. This is fast, but if several identical tags exist only the first one is returned.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

#1. Usage
from bs4 import BeautifulSoup
# soup=BeautifulSoup(html_doc,'lxml')
# # soup=BeautifulSoup(open('a.html'),'lxml')
#
# print(soup.p)  # if several identical tags exist only the first is returned: <p class="title" id="my p"><b class="boldest" id="bbb">The Dormouse's story</b></p>
# print(soup.a)  # only the first is returned: <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
#
#2. Get the tag's name
# print(soup.p.name)  # p
#
#3. Get the tag's attributes
# print(soup.p.attrs)  # {'id': 'my p', 'class': ['title']}
#
#4. Get the tag's content
# print(soup.p.string,111)   # returned only when p has exactly one piece of text, otherwise None
# print(soup.p.strings,222)  # a generator object yielding all the text under p
# print(soup.p.text,333)     # all the text under p
# for line in soup.stripped_strings:  # strip the whitespace
#     print(line,444)
'''
The Dormouse's story 111
<generator object _all_strings at 0x00000186C64B5F68> 222
The Dormouse's story 333
The Dormouse's story 444
The Dormouse's story 444
Once upon a time there were three little sisters; and their names were 444
Elsie 444
, 444
Lacie 444
and 444
Tillie 444
; and they lived at the bottom of a well. 444
... 444
'''
# If a tag contains several child nodes, .string cannot decide which child's content to return, so .string is None;
# if there is exactly one child node, its text is returned. For a structure like the one below, soup.p.string is None
# but soup.p.strings still finds all of the text.
html_doc2 = '''<p id='list-1'>
哈哈哈哈
<a class='sss'>
    <span>
        <h1>aaaa</h1>
    </span>
</a>
<b>bbbbb</b>
</p>
'''
# soup=BeautifulSoup(html_doc2,'lxml')
# print(soup.p.string)
# print(soup.p.strings)
# for line in soup.p.strings:
#     print(line)
#
#5. Nested selection
# print(soup.head.title.string)
# print(soup.body.a.string)
#
#6. Children and descendants
# print(soup.p.contents)    # all direct children of p
# print(soup.p.children)    # an iterator over all direct children of p
# for i,child in enumerate(soup.p.children):
#     print(i,child)
# print(soup.p.descendants) # descendants: every tag under p is selected
# for i,child in enumerate(soup.p.descendants):
#     print(i,child)
#
#7. Parent and ancestor nodes
# print(soup.a.parent)  # the parent of the a tag
# print(soup.a.parents) # all ancestors of the a tag: its parent, the parent's parent, ...
#
#8. Sibling nodes
# print('=====>')
# print(soup.a.next_sibling)         # the next sibling
# print(soup.a.previous_sibling)     # the previous sibling
#
# print(list(soup.a.next_siblings))  # the following siblings => generator object
# print(soup.a.previous_siblings)    # the preceding siblings => generator object

# ----------------------------------------------------------------------------------------------------
# Searching the document tree: BeautifulSoup defines many search methods; the two most important are
# find() and find_all(). The other methods take similar parameters and are used in the same way.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
'''
from bs4 import BeautifulSoup
soup=BeautifulSoup(html_doc,'lxml')

#1. The five kinds of filter: string, regular expression, list, True, function

#1.1 String, i.e. a tag name
print(soup.find_all('b'))

#1.2 Regular expression
import re
print(soup.find_all(re.compile('^b')))  # tags starting with b; matches both body and b

#1.3 List: when a list is passed, Beautiful Soup returns everything that matches any element of the list.
#    The following finds all <a> and <b> tags:
print(soup.find_all(['a','b']))

#1.4 True: matches anything; the following finds every tag but returns no string nodes
print(soup.find_all(True))
for tag in soup.find_all(True):
    print(tag.name)

#1.5 Function: if no built-in filter fits, define a function taking a single tag argument that
#    returns True when the current tag matches and False otherwise
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')
print(soup.find_all(has_class_but_no_id))
'''

# ----------------------------------------------------------------------------------------------------
'''
Using find_all()
from bs4 import BeautifulSoup
import re
soup=BeautifulSoup(html_doc,'lxml')

#2. find_all( name , attrs , recursive , text , **kwargs )

#2.1 name: can be any type of filter -- a string, a regular expression, a list, a function or True
print(soup.find_all(name=re.compile('^t')))

#2.2 keyword: key=value form; the value can be a string, a regular expression, a list or True
print(soup.find_all(id=re.compile('my')))
print(soup.find_all(href=re.compile('lacie'),id=re.compile('\d')))  # note: for classes use class_
print(soup.find_all(id=True))  # tags that have an id attribute

# Some tag attributes cannot be used as keyword arguments, e.g. HTML5 data-* attributes:
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>','lxml')
# data_soup.find_all(data-foo="value")  # SyntaxError: keyword can't be an expression
# but the attrs dict parameter of find_all() can search for such special attributes:
print(data_soup.find_all(attrs={"data-foo": "value"}))
# [<div data-foo="value">foo!</div>]

#2.3 Searching by class name: the keyword is class_, and class_=value accepts any of the five filters
print(soup.find_all('a',class_='sister'))        # a tags whose class is sister
print(soup.find_all('a',class_='sister ssss'))   # a tags with classes sister and ssss; a wrong order also fails to match
print(soup.find_all(class_=re.compile('^sis')))  # every tag whose class starts with sis

#2.4 attrs
print(soup.find_all('p',attrs={'class':'story'}))

#2.5 text: the value can be a string, a list, True or a regular expression
print(soup.find_all(text='Elsie'))
print(soup.find_all('a',text='Elsie'))

#2.6 limit: if the document tree is large, searching is slow. If not all results are needed, limit caps the
#    number of results, like SQL's LIMIT keyword: the search stops as soon as limit matches have been found
print(soup.find_all('a',limit=2))

#2.7 recursive: find_all() searches all descendants of the tag; pass recursive=False to search only the direct children
print(soup.html.find_all('a'))
print(soup.html.find_all('a',recursive=False))
'''

'''
Calling a tag like find_all()
find_all() is almost the most commonly used search method in Beautiful Soup, so a shorthand is defined for it.
A BeautifulSoup object or a Tag object can be called like a function, with the same result as calling that
object's find_all() method. The following two lines are equivalent:
soup.find_all("a")
soup("a")
So are these two:
soup.title.find_all(text=True)
soup.title(text=True)
'''

'''
The find() method
#3. find( name , attrs , recursive , text , **kwargs )
find_all() returns every matching tag in the document, but sometimes only one result is wanted. If the document
contains a single <body> tag, searching for it with find_all() is not really appropriate; rather than calling
find_all with limit=1, call find() directly. The following two lines are equivalent:

soup.find_all('title', limit=1)
# [<title>The Dormouse's story</title>]
soup.find('title')
# <title>The Dormouse's story</title>

The only difference is that find_all() returns a list containing a single element, while find() returns the result
directly. When nothing matches, find_all() returns an empty list while find() returns None.
print(soup.find("nosuchtag"))  # None

soup.head.title is shorthand for searching by tag name; the shorthand works by repeatedly calling find() on the
current tag (chained calls):
soup.head.title
# <title>The Dormouse's story</title>
soup.find("head").find("title")
# <title>The Dormouse's story</title>
'''

# Beautiful Soup documentation
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html#find-parents-find-parent

# The module also provides a select() method supporting CSS selectors, see the docs:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html#id37
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
    <b>The Dormouse's story</b>
    Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">
        <span>Elsie</span>
    </a>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    <div class='panel-1'>
        <ul class='list' id='list-1'>
            <li class='element'>Foo</li>
            <li class='element'>Bar</li>
            <li class='element'>Jay</li>
        </ul>
        <ul class='list list-small' id='list-2'>
            <li class='element'><h1 class='yyyy'>Foo</h1></li>
            <li class='element xxx'>Bar</li>
            <li class='element'>Jay</li>
        </ul>
    </div>
    and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
'''
from bs4 import BeautifulSoup
soup=BeautifulSoup(html_doc,'lxml')

#1. CSS selectors
print('.sister',soup.p.select('.sister'))
print('.sister span',soup.select('.sister span'))
print('#link1',soup.select('#link1'))
print('#link1 span',soup.select('#link1 span'))
print('#list-2 .element.xxx',soup.select('#list-2 .element.xxx'))
print('#list-2','.element',soup.select('#list-2')[0].select('.element'))  # select() can be chained, but a single select is usually enough

#2. Get attributes
print('#list-2 h1',soup.select('#list-2 h1')[0].attrs)

#3. Get content
print('#list-2 h1',soup.select('#list-2 h1')[0].get_text())
'''

 

Using concurrency to make the crawler faster

1. Synchronous calls, no concurrency

# 同步调用
import requests

def parse_page(res):
    print('解析 %s' %(len(res)))

def get_page(url):
    print('下载 %s' %url)
    response=requests.get(url)
    if response.status_code == 200:
        return response.text

urls=['https://www.baidu.com/','http://www.sina.com.cn/','https://www.python.org']
for url in urls:
    res=get_page(url) #调用一个任务,就在原地等待任务结束拿到结果后才继续往后执行
    parse_page(res)

2. Using multiple processes (threads)

#IO密集型程序应该用多线程
# 多进程或多线程
import requests
from threading import Thread,current_thread

def parse_page(res):
    print('%s 解析 %s' %(current_thread().getName(),len(res))) # current_thread().getName()获取当前线程的名字

def get_page(url,callback=parse_page):
    print('%s 下载 %s' %(current_thread().getName(),url))
    response=requests.get(url)
    if response.status_code == 200:
        callback(response.text)

if __name__ == '__main__':
    urls=['https://www.baidu.com/','http://www.sina.com.cn/','https://www.python.org']
    for url in urls:
        t=Thread(target=get_page,args=(url,))
        t.start()
# Thread-1 下载 https://www.baidu.com/
# Thread-2 下载 http://www.sina.com.cn/
# Thread-3 下载 https://www.python.org
# Thread-1 解析 2443
# Thread-2 解析 570061
# Thread-3 解析 48823

3. Using a process pool (thread pool)

#IO密集型程序应该用多线程,所以此时我们使用线程池
# 进程池或线程池:异步调用+回调机制
import requests
from threading import current_thread
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor

def parse_page(res):
    res=res.result()    # 这个result()是线程池的回调函数中的参数是个对象需要使用result来得到线程的返回值
    print('%s 解析 %s' %(current_thread().getName(),len(res)))

def get_page(url):
    print('%s 下载 %s' %(current_thread().getName(),url))
    response=requests.get(url)
    if response.status_code == 200:
        return response.text

if __name__ == '__main__':
    urls=['https://www.baidu.com/','http://www.sina.com.cn/','https://www.python.org']

    pool=ThreadPoolExecutor(50)
    # pool=ProcessPoolExecutor(50)
    for url in urls:
        pool.submit(get_page,url).add_done_callback(parse_page)

    pool.shutdown(wait=True)

4. Using the asyncio module, which detects IO for us (network IO only) and switches tasks at the application level

# asycio基本使用,可以帮助我们检测网络IO
import asyncio

@asyncio.coroutine
def task(task_id,senconds):
    print('%s is start' %task_id)
    yield from asyncio.sleep(senconds) #只能检测网络IO,检测到IO后切换到其他任务执行
    print('%s is end' %task_id)

tasks=[task(task_id="任务1",senconds=3),task("任务2",2),task(task_id="任务3",senconds=1)]

loop=asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
loop.close()

# 任务2 is start   (asyncio.wait wraps the coroutines in a set, so the start order does not follow the list order and is not guaranteed)
# 任务1 is start
# 任务3 is start
# 任务3 is end
# 任务2 is end
# 任务1 is end

5. The asyncio module can only make TCP-level requests, it does not speak HTTP; so when we need to send an HTTP request we must build the HTTP headers ourselves

# asyncio模块只能发tcp级别的请求,不能发http协议,因此,在我们需要发送http请求的时候,需要我们自定义http报头
# asyncio+自定义http协议报头
import asyncio
import requests
import uuid
user_agent='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'

def parse_page(host,res):
    print('%s 解析结果 %s' %(host,len(res)))
    with open('%s.html' %(uuid.uuid1()),'wb') as f:
        # uuid.uuid1()  基于MAC地址,时间戳,随机数来生成唯一的uuid,可以保证全球范围内的唯一性。详见博客:https://www.cnblogs.com/franknihao/p/7307224.html
        f.write(res)

@asyncio.coroutine
def get_page(host,port=80,url='/',callback=parse_page,ssl=False):   # 'www.baidu.com',url='/s?wd=美女',ssl=True
    print('下载 http://%s:%s%s' %(host,port,url))

    #步骤一(IO阻塞):发起tcp链接,是阻塞操作,因此需要yield from
    if ssl:
        port=443
    recv,send=yield from asyncio.open_connection(host=host,port=port,ssl=ssl)
    # 什么是yield from呢?  详见博客:https://blog.csdn.net/qq_27825451/article/details/78847473
    #
    #      简单地说,yield from  generator 。实际上就是返回另外一个生成器的值。如下所示:
    #
    #
    # def generator2():
    # yield 'a'
    # yield 'b'
    # yield 'c'
    # yield from generator1() #yield from iterable本质上等于 for item in iterable: yield item的缩写版
    # yield from [11,22,33,44]
    # yield from (12,23,34)
    # yield from range(3)

    # 步骤二:封装http协议的报头,因为asyncio模块只能封装并发送tcp包,因此这一步需要我们自己封装http协议的包
    request_headers="""GET %s HTTP/1.0\r\nHost: %s\r\nUser-agent: %s\r\n\r\n""" %(url,host,user_agent)  # 请求头
    # requset_headers="""POST %s HTTP/1.0\r\nHost: %s\r\n\r\nname=egon&password=123""" % (url, host,)
    request_headers=request_headers.encode('utf-8')

    # 步骤三(IO阻塞):发送http请求包
    send.write(request_headers)
    yield from send.drain()

    # 步骤四(IO阻塞):接收响应头
    while True:
        line=yield from recv.readline()
        if line == b'\r\n':
            break
        print('%s Response headers:%s' %(host,line))

    # 步骤五(IO阻塞):接收响应体
    text=yield from recv.read()

    # 步骤六:执行回调函数
    callback(host,text)

    # 步骤七:关闭套接字
    send.close() #没有recv.close()方法,因为是四次挥手断链接,双向链接的两端,一端发完数据后执行send.close()另外一端就被动地断开


if __name__ == '__main__':
    tasks=[
        get_page('www.baidu.com',url='/s?wd=美女',ssl=True),
        get_page('www.cnblogs.com',url='/',ssl=True),
    ]

    loop=asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()

6. Using the aiohttp module, which builds the HTTP messages for us; asyncio is still used to detect IO and switch tasks

# this code errors with newer aiohttp: the generator-style yield from aiohttp.request(...) usage below is the old aiohttp API; current versions expect a ClientSession with async/await (see the sketch after this example)
# asyncio+aiohttp
import aiohttp
import asyncio

@asyncio.coroutine
def get_page(url):
    print('GET:%s' %url)
    response=yield from aiohttp.request('GET',url)

    data=yield from response.read()

    print(url,data)
    response.close()
    return 1

tasks=[
    get_page('https://www.python.org/doc'),
    get_page('https://www.cnblogs.com/linhaifeng'),
    get_page('https://www.openstack.org')
]

loop=asyncio.get_event_loop()
results=loop.run_until_complete(asyncio.gather(*tasks))
loop.close()

print('=====>',results) #[1, 1, 1]
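A sketch of the same three downloads written against the newer aiohttp ClientSession API with async/await (assuming aiohttp 3.x is installed):

import aiohttp
import asyncio

async def get_page(url):
    print('GET:%s' % url)
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            data = await response.read()   # raw body bytes
    print(url, len(data))
    return 1

async def main():
    urls = ['https://www.python.org/doc', 'https://www.cnblogs.com/linhaifeng', 'https://www.openstack.org']
    results = await asyncio.gather(*[get_page(u) for u in urls])
    print('=====>', results)  # [1, 1, 1]

asyncio.get_event_loop().run_until_complete(main())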

7. Building on the above, hand the requests.get call to asyncio via run_in_executor so its IO can be detected

# asyncio+requests模块的方法
import requests
import asyncio

@asyncio.coroutine
def get_page(func,*args):   # requests.get,'https://www.python.org/doc'
    print('GET:%s' %args[0])
    loop=asyncio.get_event_loop()
    furture=loop.run_in_executor(None,func,*args)
    response=yield from furture

    print(response.url,len(response.text))
    return 1

tasks=[
    get_page(requests.get,'https://www.python.org/doc'),
    get_page(requests.get,'https://www.cnblogs.com/linhaifeng'),
    get_page(requests.get,'https://www.openstack.org')
]

loop=asyncio.get_event_loop()
results=loop.run_until_complete(asyncio.gather(*tasks))
loop.close()

print('=====>',results) #[1, 1, 1]

8. Coroutines with gevent + requests

# gevent+requests
from gevent import monkey;monkey.patch_all()
import gevent
import requests

def get_page(url):
    print('GET:%s' %url)
    response=requests.get(url)
    print(url,len(response.text))
    return 1

# g1=gevent.spawn(get_page,'https://www.python.org/doc')
# g2=gevent.spawn(get_page,'https://www.cnblogs.com/linhaifeng')
# g3=gevent.spawn(get_page,'https://www.openstack.org')
# gevent.joinall([g1,g2,g3,])
# print(g1.value,g2.value,g3.value) #拿到返回值


#协程池
from gevent.pool import Pool
pool=Pool(2)
g1=pool.spawn(get_page,'https://www.python.org/doc')
g2=pool.spawn(get_page,'https://www.cnblogs.com/linhaifeng')
g3=pool.spawn(get_page,'https://www.openstack.org')
gevent.joinall([g1,g2,g3,])
print(g1.value,g2.value,g3.value) #拿到返回值

9. The grequests module, which bundles gevent + requests

import grequests

request_list=[
    grequests.get('https://wwww.xxxx.org/doc1'),
    grequests.get('https://www.cnblogs.com/linhaifeng'),
    grequests.get('https://www.openstack.org')
]


##### 执行并获取响应列表 #####
# response_list = grequests.map(request_list)
# print(response_list)

##### 执行并获取响应列表(处理异常) #####
def exception_handler(request, exception):
    # print(request,exception)
    print("%s Request failed" %request.url)

response_list = grequests.map(request_list, exception_handler=exception_handler)
print(response_list,111)

# https://wwww.xxxx.org/doc1 Request failed
# [None, <Response [200]>, <Response [200]>] 111  reponse_list是个列表,元素是响应对象

10. twisted: a networking framework; one of its features is sending asynchronous requests, detecting IO and switching automatically

'''
# Problem 1: error: Microsoft Visual C++ 14.0 is required. Get it with "Microsoft Visual C++ Build Tools":
# http://landinghub.visualstudio.com/visual-cpp-build-tools
# https://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
# pip3 install C:\Users\Administrator\Downloads\Twisted-17.9.0-cp36-cp36m-win_amd64.whl
# pip3 install twisted

# Problem 2: ModuleNotFoundError: No module named 'win32api'
# https://sourceforge.net/projects/pywin32/files/pywin32/

# Problem 3: openssl
# pip3 install pyopenssl
'''

# Basic twisted usage
from twisted.web.client import getPage,defer
from twisted.internet import reactor

def all_done(arg):
    # print(arg)
    reactor.stop()

def callback(res):
    print(res)
    return 1

defer_list=[]
urls=[
    'http://www.baidu.com',
    'http://www.bing.com',
    'https://www.python.org',
]
for url in urls:
    obj=getPage(url.encode('utf-8'),)
    obj.addCallback(callback)
    defer_list.append(obj)

defer.DeferredList(defer_list).addBoth(all_done)
reactor.run()

# Details of twisted's getPage
from twisted.internet import reactor
from twisted.web.client import getPage
import urllib.parse

def one_done(arg):
    print(arg)
    reactor.stop()

post_data = urllib.parse.urlencode({'check_data': 'adf'})
post_data = bytes(post_data, encoding='utf8')
headers = {b'Content-Type': b'application/x-www-form-urlencoded'}
response = getPage(bytes('http://dig.chouti.com/login', encoding='utf8'),
                   method=bytes('POST', encoding='utf8'),
                   postdata=post_data,
                   cookies={},
                   headers=headers)
response.addBoth(one_done)
reactor.run()

11.tornado

from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop


def handle_response(response):
    """
    处理返回值内容(需要维护计数器,来停止IO循环),调用 ioloop.IOLoop.current().stop()
    :param response:
    :return:
    """
    if response.error:
        print("Error:", response.error)
    else:
        print(response.body)


def func():
    url_list = [
        'http://www.baidu.com',
        'http://www.bing.com',
    ]
    for url in url_list:
        print(url)
        http_client = AsyncHTTPClient()
        http_client.fetch(HTTPRequest(url), handle_response)


ioloop.IOLoop.current().add_callback(func)
ioloop.IOLoop.current().start()




# the example above does not exit even after all tasks have finished; to fix that, add a counter
from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop

count=0

def handle_response(response):
    """
    处理返回值内容(需要维护计数器,来停止IO循环),调用 ioloop.IOLoop.current().stop()
    :param response:
    :return:
    """
    if response.error:
        print("Error:", response.error)
    else:
        print(len(response.body))

    global count
    count-=1 #完成一次回调,计数减1
    if count == 0:
        ioloop.IOLoop.current().stop() 

def func():
    url_list = [
        'http://www.baidu.com',
        'http://www.bing.com',
    ]

    global count
    for url in url_list:
        print(url)
        http_client = AsyncHTTPClient()
        http_client.fetch(HTTPRequest(url), handle_response)
        count+=1 #计数加1

ioloop.IOLoop.current().add_callback(func)
ioloop.IOLoop.current().start()


 

Reposted from: https://www.cnblogs.com/perfey/p/9493468.html
