"""BeautifulSoup crash notes: findAll/find, tree navigation, regex matching.

Earlier warm-up examples (kept for reference)::

    from urllib.request import urlopen
    from bs4 import BeautifulSoup

    html = urlopen("http://pythonscraping.com/pages/page1.html")
    print(html.read())

    html = urlopen("http://www.pythonscraping.com/pages/page1.html")
    bsObj = BeautifulSoup(html.read(), "html.parser")
    print(bsObj.h1)

    html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
    bsObj = BeautifulSoup(html, "html.parser")
    for name in bsObj.findAll("span", {"class": "green"}):
        print(name.get_text())

findAll(tag, attributes, recursive, text, limit, keywords)
    recursive defaults to True (search the whole subtree; False = first
    level only); limit caps the number of results; find(...) is simply
    findAll(...) with limit=1.

    By tag set:       bsObj.findAll({"h1", "h2", "h3", "h4", "h5", "h6"})
    Tag + attribute:  bsObj.findAll("span", {"class": {"green", "red"}})
    By text:          bsObj.findAll(text="the prince")
    By keyword:       bsObj.findAll(id="text")  ==  bsObj.findAll("", {"id": "text"})
    (tag + attribute is usually preferred over keyword arguments)

The four BeautifulSoup object kinds:
    1. BeautifulSoup  -- the whole document (``bsObj`` below)
    2. Tag            -- what find/findAll or ``bsObj.div.h1`` return
    3. NavigableString -- the text inside a tag (not a tag itself)
    4. Comment        -- an HTML comment, <!-- like this -->

Navigation tree of the demo shop page
(http://www.pythonscraping.com/pages/page3.html)::

    html -- body -- div.wrapper -- h1
                                -- div.content
                                -- table#giftList -- tr (th x4)
                                                  -- tr.gift#gift1 -- td
                                                                   -- td -- span.excitingNote
                                                                   -- td
                                                                   -- td -- img
                                                  -- ...other rows omitted...
                                -- div.footer
"""
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
# Name the parser explicitly: bare BeautifulSoup(html) emits a
# GuessedAtParserWarning and may parse differently from machine to machine
# depending on which parser libraries happen to be installed.
bsObj = BeautifulSoup(html, "html.parser")

# 1. Children vs. descendants: .children yields direct children only,
#    .descendants yields the entire subtree.
# for child in bsObj.find("table", {"id": "giftList"}).children:
#     print(child)

# 2. Siblings: tr.next_siblings yields everything AFTER the first <tr>
#    (the header row itself is excluded) -- handy for tables with a title
#    row. previous_siblings walks backwards; next_sibling/previous_sibling
#    return a single node instead of a generator.
# for sibling in bsObj.find("table", {"id": "giftList"}).tr.next_siblings:
#     print(sibling)

# 3. Parents: (1) pick the <img> by src, (2) .parent is its <td>,
#    (3) .previous_sibling is the price <td>, (4) get_text() -> "$15.00".
# print(bsObj.find("img", {"src": "../img/gifts/img1.jpg"})
#       .parent.previous_sibling.get_text())

# 4. Regular expressions can be used as any BeautifulSoup argument.
import re

# Prints the relative image paths ../img/gifts/img1.jpg ... img6.jpg
# (raw string: the original non-raw pattern relied on invalid escapes):
# images = bsObj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
# for image in images:
#     print(image["src"])

# 5. Tag attributes: tag.attrs is a dict of every attribute,
#    e.g. myImgTag.attrs["src"] for an image's source URL.

# 6. Lambda matching (works anywhere a tag filter is accepted):
#    soup.findAll(lambda tag: len(tag.attrs) == 2)
#    matches tags that carry exactly two attributes, e.g.
#    <div class="body" id="content"></div>
#    <span style="color:red" class="title"></span>

# 7. Other HTML parsers: lxml, html.parser (stdlib HTMLParser).
# ---- beautifulSoup模块的基本用法 (basic usage of the BeautifulSoup module) ----
# ---- 单进程和并发爬取网站比较 (single-process vs. concurrent site crawling, compared) ----
"""Single-process crawler: index pages -> detail pages -> movie downloads.

Pipeline: get_page() fetches raw bytes, parse_index() extracts detail-page
URLs, parse_detail() extracts the movie URL, save() downloads it.
The concurrent (thread-pool) variant is kept at the bottom for comparison.
"""
import hashlib
import os
import re
import time


def get_page(url):
    """GET *url* and return the body bytes, or None on any error / non-200.

    The original version swallowed every exception silently; now the
    failure is at least reported before returning None.
    """
    # requests is imported lazily so the pure parsing helpers below can be
    # used (and unit-tested) without the third-party package installed.
    import requests
    print('GET %s' % url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.content
    except requests.RequestException as e:
        print('GET %s failed: %s' % (url, e))
    return None


def parse_index(res):
    """Yield absolute detail-page URLs scraped from index-page bytes *res*.

    *res* must be gbk-decodable bytes (the site serves gbk). Relative
    links are made absolute against the site root.
    """
    pattern = re.compile(r'class="items.*?<a href="(.*?)"', re.S)
    for detail_url in pattern.findall(res.decode('gbk')):
        if not detail_url.startswith('http'):
            detail_url = 'http://www.xiaohuar.com' + detail_url
        yield detail_url


def parse_detail(res):
    """Return the movie URL found in detail-page bytes *res*, or None."""
    pattern = re.compile(r'id="media".*?src="(.*?)"', re.S)
    found = pattern.findall(res.decode('gbk'))
    return found[0] if found else None


def save(movie_url):
    """Download *movie_url* into ./movies/ under an MD5-derived filename."""
    import requests  # lazy: see note in get_page
    response = requests.get(movie_url, stream=False)
    if response.status_code == 200:
        # Filename = md5(url + timestamp) so repeated downloads never clash.
        digest = hashlib.md5(
            ('%s%s.mp4' % (movie_url, time.time())).encode('utf-8')
        ).hexdigest()
        # The original crashed with FileNotFoundError if ./movies was missing.
        os.makedirs('./movies', exist_ok=True)
        with open('./movies/%s.mp4' % digest, 'wb') as f:
            f.write(response.content)
            f.flush()


def main():
    """Crawl the first five index pages and download every movie found."""
    index_url = 'http://www.xiaohuar.com/list-3-{0}.html'
    for i in range(5):
        print('*' * 50, i)
        # Fetch one index page.
        index_page = get_page(index_url.format(i))
        if index_page is None:
            # Network failure: skip this page instead of crashing on
            # None.decode() as the original did.
            continue
        # Walk every detail page the index links to.
        for detail_url in parse_index(index_page):
            detail_page = get_page(detail_url)
            if detail_page is None:
                continue
            movie_url = parse_detail(detail_page)
            if movie_url:
                save(movie_url)


if __name__ == '__main__':
    main()


'''
# 并发爬取 -- concurrent version (thread pool + done-callbacks), for comparison:

from concurrent.futures import ThreadPoolExecutor
import queue
import requests
import re
import time
import hashlib
from threading import current_thread

p=ThreadPoolExecutor(50)

def get_page(url):
    print('%s GET %s' %(current_thread().getName(),url))
    try:
        response=requests.get(url)
        if response.status_code == 200:
            return response.content
    except Exception as e:
        print(e)

def parse_index(res):
    print('%s parse index ' %current_thread().getName())
    res=res.result()
    obj=re.compile('class="items.*?<a href="(.*?)"',re.S)
    detail_urls=obj.findall(res.decode('gbk'))
    for detail_url in detail_urls:
        if not detail_url.startswith('http'):
            detail_url='http://www.xiaohuar.com'+detail_url
        p.submit(get_page,detail_url).add_done_callback(parse_detail)

def parse_detail(res):
    print('%s parse detail ' %current_thread().getName())
    res=res.result()
    obj=re.compile('id="media".*?src="(.*?)"',re.S)
    res=obj.findall(res.decode('gbk'))
    if len(res) > 0:
        movie_url=res[0]
        print('MOVIE_URL: ',movie_url)
        with open('db.txt','a') as f:
            f.write('%s\n' %movie_url)
        # save(movie_url)
        p.submit(save,movie_url)
        print('%s下载任务已经提交' %movie_url)

def save(movie_url):
    print('%s SAVE: %s' %(current_thread().getName(),movie_url))
    try:
        response=requests.get(movie_url,stream=False)
        if response.status_code == 200:
            m=hashlib.md5()
            m.update(('%s%s.mp4' %(movie_url,time.time())).encode('utf-8'))
            filename=m.hexdigest()
            with open(r'./movies/%s.mp4' %filename,'wb') as f:
                f.write(response.content)
                f.flush()
    except Exception as e:
        print(e)

def main():
    index_url='http://www.xiaohuar.com/list-3-{0}.html'
    for i in range(5):
        p.submit(get_page,index_url.format(i,)).add_done_callback(parse_index)

if __name__ == '__main__':
    main()
'''
# ---- requests模块 (the requests module) ----
'''
import requests
response=requests.get('https://www.baidu.com/s?wd=python&pn=1',  # wd表示搜索内容 pn=1表示只访问第一页
                      headers={
                          # 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
                          'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
                      })
# 得到的response就是用浏览器访问时network里的response
print(response.text)
'''

import requests
import re

'''
#第一次请求
r1=requests.get('https://github.com/login')
# with open('r1.html','w',encoding='utf-8') as f:
#     f.write(r1.text)
r1_cookie=r1.cookies.get_dict() #拿到初始cookie(未被授权)
authenticity_token=re.findall(r'name="authenticity_token".*?value="(.*?)"',r1.text)[0] #从页面中拿到CSRF TOKEN
# print(r1_cookie,authenticity_token)
# {'logged_in': 'no', '_gh_sess': 'Q1ZiK1V5RjBmdTNoYVk0Y3FZRURIVmpMY1hXa3VGVTVFRmh1blN5QVltNGhmQnRFNkFvcmZpRzFST2ZxS2R1S0NZWUdEa0pZcFh2WGt3UE0xR3I4WmhLRUpnOW1aZzhoVVpDVU51UWZVbVJoZHo1UXBoZzVyY3Y2NE9pR1VqbGxZWVZxN3lZdEtqczA2M3BIaythOVlKVHRsdCtmV3hFT1B2WG0rRzRVcFY1Q2gyeGJVeVM5VGh3MllLVkdPdWxsVUpyL2dKeER1cWNxc1ppVjNTMGdwYUVoeE9UVjloa0FnYmIzdUhBbnNLNTZWQUx2WlR0VHYvM2J3VWdKZ2RrdjdHMklzODNFOTVNRTU0NTdMakEyMkV0UE5VY1VyOVBUeG14OUxidGdQOUZSQ2lITlVmVWxkSWpmOHc5Nm5sNWwtLXZBOVl0MmltcjMrQ2JVeGMzQjhVSlE9PQ%3D%3D--fc83b0b0c8be98b0d16b999f3b0dfe3386c47ad3', 'has_recent_activity': '1'}
# koXm+IFLTc/b5l+QNZBRL+R0sdFlvIY0bAMmof1TadJIUVCXIpmUtLTZQRHZKOFL0dPALof29ASylREm9vsqnw==

#第二次请求:带着初始cookie和TOKEN发送POST请求给登录页面,带上账号密码
data={
    'commit':'Sign in',
    'utf8':'✓',
    'authenticity_token':authenticity_token,
    'login':'317828332@qq.com',
    'password':'alex3714'
}
r2=requests.post('https://github.com/session',
                 data=data,
                 cookies=r1_cookie
                 )
login_cookie=r2.cookies.get_dict()
print(login_cookie)
# {'logged_in': 'yes', '__Host-user_session_same_site': '-Twt5ZngjpZyILQLdG90CU-v7V_TAJEgKRXcWiAUBVeST-nO', '_gh_sess': 'eE9ZRFowWFVlTE9TUXhTTDE0U1BxbWU1WGw2M09vN0dzTGxiZkZhZXQ0Ui9xWEJMd0lJMlc0SHEzYldLMmJRZFpqbXZ1eXY0cWxaazdCeC9kUmhvS2JXdFBlY1h4Um1RZXJHYlVTWGxseldEc1J6OEord3lWUHhJVk5OQXRBMnFvTG13Q3BRd0ZlN3dFbFBTeGlYTXhwNS9FaGlwMldYYnMyQldhWVhwQk5xeTJWZG9URnI5QWF0OGliUzZ3bXRHTFVrWjR0cWZwWEtFQXJTeGtpT1dqbzR0Q0dIV3RDUjlwdmpmZG9zWHlDdz0tLTlKUHFCM0UwNWtNU1A4ZmNhS1hjZHc9PQ%3D%3D--28dc2b8141a93588ab52d166cc13c188c278f4d5', 'has_recent_activity': '1', 'user_session': '-Twt5ZngjpZyILQLdG90CU-v7V_TAJEgKRXcWiAUBVeST-nO'}
# 总结下,首先使用get请求的到初始cookie(未被授权)和CSRF TOKEN,然后使用post发送带着初始cookie和TOKEN发送POST请求给登录页面,带上账号密码

#第三次请求:以后的登录,拿着login_cookie就可以,比如访问一些个人配置
login_cookie = {'logged_in': 'yes', '__Host-user_session_same_site': 'Q3_oQ9DLwIZ5Z4vCrlrEtw4PSd8E8WLN7ftycrLINDmyLU3Y', '_gh_sess': 'UE9ZTVdFRHpCS2sreEpVRFNKUTR1NnE2VGVRMkU2bzZUa3p5YkpzQTFaS0wxTHBZSlFsaUpyeVlhdWNoZy9qTGkzaW9DNGNvbzMzdStnY3JzQXFlUElpT3RVSkk4RFAza2Zac3ZWWTdDL0hCY0p4aU9RbnBNMERqMXFUNEk1OUNUeVhCZTljTS9IeUxsK3FISEhrZjJHL0kvVVhjV2hkVzBPd292SmUxVEx1L2hRWHlGSDVPUTdXU2ZhNzRRSzBHdUpkREdOVjlSRy94Q0FBVUFjY05wOGQ5MEdUMkEyOHQ1RDNRUnRCeDNFYz0tLU5FR013M3IvY3d2UTlRbDFoamk2VGc9PQ%3D%3D--4b83733a566892557a9ba486333a49c1b72d13f3', 'has_recent_activity': '1', 'user_session': 'Q3_oQ9DLwIZ5Z4vCrlrEtw4PSd8E8WLN7ftycrLINDmyLU3Y'}
r3=requests.get('https://github.com/settings/emails',
                cookies=login_cookie)
with open('r3.html','w',encoding='utf-8') as f:
    f.write(r3.text)
print('317828332@qq.com' in r3.text) #True
'''

import requests

respone = requests.get('http://www.baidu.com')

# respone属性 -- dump every interesting Response attribute, with a rule of
# asterisks between consecutive entries (data-driven instead of ten
# hand-written print pairs; the output is identical).
_attributes = (
    ('respone.text', respone.text),
    ('content', respone.content),
    ('status_code', respone.status_code),
    ('headers', respone.headers),
    ('cookies', respone.cookies),
    ('get_dict', respone.cookies.get_dict()),
    ('items', respone.cookies.items()),
    ('url', respone.url),
    ('history', respone.history),
    ('encoding', respone.encoding),
)
for _pos, (_label, _value) in enumerate(_attributes):
    if _pos:
        print('*' * 50)
    print(_label, _value)
''' respone.text <!DOCTYPE html> <!--STATUS OK--><html> <head><meta http-equiv=content-type content=text/html;charset=utf-8><meta http-equiv=X-UA-Compatible content=IE=Edge><meta content=always name=referrer><link rel=stylesheet type=text/css href=http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css><title>ç¾åº¦ä¸ä¸ï¼ä½ å°±ç¥é</title></head> <body link=#0000cc> <div id=wrapper> <div id=head> <div class=head_wrapper> <div class=s_form> <div class=s_form_wrapper> <div id=lg> <img hidefocus=true src=//www.baidu.com/img/bd_logo1.png width=270 height=129> </div> <form id=form name=f action=//www.baidu.com/s class=fm> <input type=hidden name=bdorz_come value=1> <input type=hidden name=ie value=utf-8> <input type=hidden name=f value=8> <input type=hidden name=rsv_bp value=1> <input type=hidden name=rsv_idx value=1> <input type=hidden name=tn value=baidu><span class="bg s_ipt_wr"><input id=kw name=wd class=s_ipt value maxlength=255 autocomplete=off autofocus></span><span class="bg s_btn_wr"><input type=submit id=su value=ç¾åº¦ä¸ä¸ class="bg s_btn"></span> </form> </div> </div> <div id=u1> <a href=http://news.baidu.com name=tj_trnews class=mnav>æ°é»</a> <a href=http://www.hao123.com name=tj_trhao123 class=mnav>hao123</a> <a href=http://map.baidu.com name=tj_trmap class=mnav>å°å¾</a> <a href=http://v.baidu.com name=tj_trvideo class=mnav>è§é¢</a> <a href=http://tieba.baidu.com name=tj_trtieba class=mnav>è´´å§</a> <noscript> <a href=http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1 name=tj_login class=lb>ç»å½</a> </noscript> <script>document.write('<a href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u='+ encodeURIComponent(window.location.href+ (window.location.search === "" ? "?" 
: "&")+ "bdorz_come=1")+ '" name="tj_login" class="lb">ç»å½</a>');</script> <a href=//www.baidu.com/more/ name=tj_briicon class=bri style="display: block;">æ´å¤äº§å</a> </div> </div> </div> <div id=ftCon> <div id=ftConw> <p id=lh> <a href=http://home.baidu.com>å³äºç¾åº¦</a> <a href=http://ir.baidu.com>About Baidu</a> </p> <p id=cp>©2017 Baidu <a href=http://www.baidu.com/duty/>使ç¨ç¾åº¦åå¿è¯»</a> <a href=http://jianyi.baidu.com/ class=cp-feedback>æè§åé¦</a> 京ICPè¯030173å· <img src=//www.baidu.com/img/gs.gif> </p> </div> </div> </div> </body> </html> ************************************************** content b'<!DOCTYPE html>\r\n<!--STATUS OK--><html> <head><meta http-equiv=content-type content=text/html;charset=utf-8><meta http-equiv=X-UA-Compatible content=IE=Edge><meta content=always name=referrer><link rel=stylesheet type=text/css href=http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css><title>\xe7\x99\xbe\xe5\xba\xa6\xe4\xb8\x80\xe4\xb8\x8b\xef\xbc\x8c\xe4\xbd\xa0\xe5\xb0\xb1\xe7\x9f\xa5\xe9\x81\x93</title></head> <body link=#0000cc> <div id=wrapper> <div id=head> <div class=head_wrapper> <div class=s_form> <div class=s_form_wrapper> <div id=lg> <img hidefocus=true src=//www.baidu.com/img/bd_logo1.png width=270 height=129> </div> <form id=form name=f action=//www.baidu.com/s class=fm> <input type=hidden name=bdorz_come value=1> <input type=hidden name=ie value=utf-8> <input type=hidden name=f value=8> <input type=hidden name=rsv_bp value=1> <input type=hidden name=rsv_idx value=1> <input type=hidden name=tn value=baidu><span class="bg s_ipt_wr"><input id=kw name=wd class=s_ipt value maxlength=255 autocomplete=off autofocus></span><span class="bg s_btn_wr"><input type=submit id=su value=\xe7\x99\xbe\xe5\xba\xa6\xe4\xb8\x80\xe4\xb8\x8b class="bg s_btn"></span> </form> </div> </div> <div id=u1> <a href=http://news.baidu.com name=tj_trnews class=mnav>\xe6\x96\xb0\xe9\x97\xbb</a> <a href=http://www.hao123.com name=tj_trhao123 class=mnav>hao123</a> <a 
href=http://map.baidu.com name=tj_trmap class=mnav>\xe5\x9c\xb0\xe5\x9b\xbe</a> <a href=http://v.baidu.com name=tj_trvideo class=mnav>\xe8\xa7\x86\xe9\xa2\x91</a> <a href=http://tieba.baidu.com name=tj_trtieba class=mnav>\xe8\xb4\xb4\xe5\x90\xa7</a> <noscript> <a href=http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1 name=tj_login class=lb>\xe7\x99\xbb\xe5\xbd\x95</a> </noscript> <script>document.write(\'<a href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=\'+ encodeURIComponent(window.location.href+ (window.location.search === "" ? "?" : "&")+ "bdorz_come=1")+ \'" name="tj_login" class="lb">\xe7\x99\xbb\xe5\xbd\x95</a>\');</script> <a href=//www.baidu.com/more/ name=tj_briicon class=bri style="display: block;">\xe6\x9b\xb4\xe5\xa4\x9a\xe4\xba\xa7\xe5\x93\x81</a> </div> </div> </div> <div id=ftCon> <div id=ftConw> <p id=lh> <a href=http://home.baidu.com>\xe5\x85\xb3\xe4\xba\x8e\xe7\x99\xbe\xe5\xba\xa6</a> <a href=http://ir.baidu.com>About Baidu</a> </p> <p id=cp>©2017 Baidu <a href=http://www.baidu.com/duty/>\xe4\xbd\xbf\xe7\x94\xa8\xe7\x99\xbe\xe5\xba\xa6\xe5\x89\x8d\xe5\xbf\x85\xe8\xaf\xbb</a> <a href=http://jianyi.baidu.com/ class=cp-feedback>\xe6\x84\x8f\xe8\xa7\x81\xe5\x8f\x8d\xe9\xa6\x88</a> \xe4\xba\xacICP\xe8\xaf\x81030173\xe5\x8f\xb7 <img src=//www.baidu.com/img/gs.gif> </p> </div> </div> </div> </body> </html>\r\n' ************************************************** status_code 200 ************************************************** headers {'Cache-Control': 'private, no-cache, no-store, proxy-revalidate, no-transform', 'Connection': 'Keep-Alive', 'Content-Encoding': 'gzip', 'Content-Type': 'text/html', 'Date': 'Sun, 26 Aug 2018 09:02:09 GMT', 'Last-Modified': 'Mon, 23 Jan 2017 13:27:36 GMT', 'Pragma': 'no-cache', 'Server': 'bfe/1.0.8.18', 'Set-Cookie': 'BDORZ=27315; max-age=86400; domain=.baidu.com; path=/', 'Transfer-Encoding': 'chunked'} ************************************************** cookies 
<RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]> ************************************************** get_dict {'BDORZ': '27315'} ************************************************** items [('BDORZ', '27315')] ************************************************** url http://www.baidu.com/ ************************************************** history [] ************************************************** encoding ISO-8859-1 ''' #关闭:response.close() # from contextlib import closing # with closing(requests.get('xxx',stream=True)) as response: # for line in response.iter_content(): # pass
''' 编码问题 import requests response=requests.get('http://www.autohome.com/news') response.encoding='gbk' #汽车之家网站返回的页面内容为gb2312编码的,而requests的默认编码为ISO-8859-1,如果不设置成gbk则中文乱码 print(response.text) ------------------------------------------------------------------------------------------------------ import requests response=requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1509868306530&di=712e4ef3ab258b36e9f4b48e85a81c9d&imgtype=0&src=http%3A%2F%2Fc.hiphotos.baidu.com%2Fimage%2Fpic%2Fitem%2F11385343fbf2b211e1fb58a1c08065380dd78e0c.jpg') with open('a.jpg','wb') as f: f.write(response.content) ----------------------------------------------------------------------------------------------------- #stream参数:一点一点的取,比如下载视频时,如果视频100G,用response.content然后一下子写到文件中是不合理的 import requests response=requests.get('https://gss3.baidu.com/6LZ0ej3k1Qd3ote6lo7D0j9wehsv/tieba-smallvideo-transcode/1767502_56ec685f9c7ec542eeaf6eac93a65dc7_6fe25cd1347c_3.mp4', stream=True) with open('b.mp4','wb') as f: for line in response.iter_content(): f.write(line) # 获取二进制流 ----------------------------------------------------------------------------------------------------- #解析json import requests response=requests.get('http://httpbin.org/get') import json res1=json.loads(response.text) #太麻烦 res2=response.json() #直接获取json数据 print(res1 == res2) #True ----------------------------------------------------------------------------------------------- Redirection and History import requests import re #第一次请求 r1=requests.get('https://github.com/login') r1_cookie=r1.cookies.get_dict() #拿到初始cookie(未被授权) authenticity_token=re.findall(r'name="authenticity_token".*?value="(.*?)"',r1.text)[0] #从页面中拿到CSRF TOKEN #第二次请求:带着初始cookie和TOKEN发送POST请求给登录页面,带上账号密码 data={ 'commit':'Sign in', 'utf8':'✓', 'authenticity_token':authenticity_token, 'login':'317828332@qq.com', 'password':'alex3714' } #测试一:没有指定allow_redirects=False,则响应头中出现Location就跳转到新页面,r2代表新页面的response 
r2=requests.post('https://github.com/session', data=data, cookies=r1_cookie ) print(r2.status_code) #200 print(r2.url) #看到的是跳转后的页面 print(r2.history) #看到的是跳转前的response print(r2.history[0].text) #看到的是跳转前的response.text #测试二:指定allow_redirects=False,则响应头中即便出现Location也不会跳转到新页面,r2代表的仍然是老页面的response r2=requests.post('https://github.com/session', data=data, cookies=r1_cookie, allow_redirects=False ) print(r2.status_code) #302 print(r2.url) #看到的是跳转前的页面https://github.com/session print(r2.history) #[] 利用github登录后跳转到主页面的例子来验证它 ------------------------------------------------------------------------------------------------------------- SSL Cert Verification #证书验证(大部分网站都是https) import requests respone=requests.get('https://www.12306.cn') #如果是ssl请求,首先检查证书是否合法,不合法则报错,程序终端 #改进1:去掉报错,但是会报警告 import requests respone=requests.get('https://www.12306.cn',verify=False) #不验证证书,报警告,返回200 print(respone.status_code) #改进2:去掉报错,并且去掉警报信息 import requests from requests.packages import urllib3 urllib3.disable_warnings() #关闭警告 respone=requests.get('https://www.12306.cn',verify=False) print(respone.status_code) #改进3:加上证书 #很多网站都是https,但是不用证书也可以访问,大多数情况都是可以携带也可以不携带证书 #知乎\百度等都是可带可不带 #有硬性要求的,则必须带,比如对于定向的用户,拿到证书后才有权限访问某个特定网站 import requests respone=requests.get('https://www.12306.cn', cert=('/path/server.crt', '/path/key')) print(respone.status_code) ---------------------------------------------------------------------------------------------------- 代理设置 #官网链接: http://docs.python-requests.org/en/master/user/advanced/#proxies #代理设置:先发送请求给代理,然后由代理帮忙发送(封ip是常见的事情) import requests proxies={ 'http':'http://egon:123@localhost:9743',#带用户名密码的代理,@符号前是用户名与密码 'http':'http://localhost:9743', 'https':'https://localhost:9743', } respone=requests.get('https://www.12306.cn', proxies=proxies) print(respone.status_code) #支持socks代理,安装:pip install requests[socks] import requests proxies = { 'http': 'socks5://user:pass@host:port', 'https': 'socks5://user:pass@host:port' } respone=requests.get('https://www.12306.cn', 
proxies=proxies) print(respone.status_code) ------------------------------------------------------------------------------------------- 超时设置 #两种超时:float or tuple #timeout=0.1 #代表接收数据的超时时间 #timeout=(0.1,0.2)#0.1代表链接超时 0.2代表接收数据的超时时间 import requests respone=requests.get('https://www.baidu.com', timeout=0.0001) # 测试两秒就不超时了 ------------------------------------------------------------------------------------------------- 认证设置 #官网链接:http://docs.python-requests.org/en/master/user/authentication/ #认证设置:登陆网站是,弹出一个框,要求你输入用户名密码(与alter很类似),此时是无法获取html的 # 但本质原理是拼接成请求头发送 # r.headers['Authorization'] = _basic_auth_str(self.username, self.password) # 一般的网站都不用默认的加密方式,都是自己写 # 那么我们就需要按照网站的加密方式,自己写一个类似于_basic_auth_str的方法 # 得到加密字符串后添加到请求头 # r.headers['Authorization'] =func('.....') #看一看默认的加密方式吧,通常网站都不会用默认的加密设置 import requests from requests.auth import HTTPBasicAuth r=requests.get('xxx',auth=HTTPBasicAuth('user','password')) print(r.status_code) #HTTPBasicAuth可以简写为如下格式 import requests r=requests.get('xxx',auth=('user','password')) print(r.status_code) -------------------------------------------------------------------------------------- #异常处理 import requests from requests.exceptions import * #可以查看requests.exceptions获取异常类型 try: r=requests.get('http://www.baidu.com',timeout=0.00001) except ReadTimeout: print('===:') # except ConnectionError: #网络不通 # print('-----') # except Timeout: # print('aaaaa') except RequestException: print('Error') ------------------------------------------------------------------------------------------ 上传文件 import requests files={'file':open('a.jpg','rb')} respone=requests.post('http://httpbin.org/post',files=files) print(respone.status_code) '''
# ---- selenium 驱动浏览器 (selenium: driving a real browser) ----
# selenium最初是一个自动化测试工具,而爬虫中使用它主要是为了解决requests无法直接执行JavaScript代码的问题
# selenium本质是通过驱动浏览器,完全模拟浏览器的操作,比如跳转、输入、点击、下拉等,来拿到网页渲染之后的结果,可支持多种浏览器
# """ # #安装:selenium+chromedriver 有界面浏览器 # pip3 install selenium # 下载chromdriver.exe放到python安装路径的scripts目录中即可,注意最新版本是2.38,并非2.9 # 国内镜像网站地址:http://npm.taobao.org/mirrors/chromedriver/2.38/ # 最新的版本去官网找:https://sites.google.com/a/chromium.org/chromedriver/downloads # # #验证安装 # C:\Users\Administrator>python3 # Python 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)] on win32 # Type "help", "copyright", "credits" or "license" for more information. # >>> from selenium import webdriver # >>> driver=webdriver.Chrome() #弹出浏览器 # >>> driver.get('https://www.baidu.com') # >>> driver.page_source # # #注意: # selenium3默认支持的webdriver是Firfox,而Firefox需要安装geckodriver # 下载链接:https://github.com/mozilla/geckodriver/releases # # # # # # # # selenium+谷歌浏览器headless模式 无界面浏览器 # # #selenium:3.12.0 # #webdriver:2.38 # #chrome.exe: 65.0.3325.181(正式版本) (32 位) # # from selenium import webdriver # from selenium.webdriver.chrome.options import Options # chrome_options = Options() # chrome_options.add_argument('window-size=1920x3000') #指定浏览器分辨率 # chrome_options.add_argument('--disable-gpu') #谷歌文档提到需要加上这个属性来规避bug # chrome_options.add_argument('--hide-scrollbars') #隐藏滚动条, 应对一些特殊页面 # chrome_options.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度 # chrome_options.add_argument('--headless') #浏览器不提供可视化页面. 
linux下如果系统不支持可视化不加这条会启动失败 # chrome_options.binary_location = r"C:\Users\afly\AppData\Local\Google\Chrome\Application\chrome.exe" #手动指定使用的浏览器位置 # # # driver=webdriver.Chrome(chrome_options=chrome_options) # driver.get('https://www.baidu.com') # # print(driver.page_source) # # print('hao123' in driver.page_source) # # # driver.close() #切记关闭浏览器,回收资源 # """ ''' 基本使用 from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR from selenium.webdriver.common.keys import Keys #键盘按键操作 from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素 browser=webdriver.Chrome() try: browser.get('https://www.baidu.com') input_tag=browser.find_element_by_id('kw') input_tag.send_keys('美女') #python2中输入中文错误,字符串前加个u input_tag.send_keys(Keys.ENTER) #输入回车 wait=WebDriverWait(browser,10) wait.until(EC.presence_of_element_located((By.ID,'content_left'))) #等到id为content_left的元素加载完毕,最多等10秒 print(browser.page_source) print(browser.current_url) print(browser.get_cookies()) finally: browser.close() ''' """ 选择器 基本用法 下面的例子是登录百度账号 #官网链接:http://selenium-python.readthedocs.io/locating-elements.html from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR from selenium.webdriver.common.keys import Keys #键盘按键操作 from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素 import time driver=webdriver.Chrome() driver.get('https://www.baidu.com') wait=WebDriverWait(driver,10) try: #===============所有方法=================== # 1、find_element_by_id # 2、find_element_by_link_text # 3、find_element_by_partial_link_text # 4、find_element_by_tag_name # 5、find_element_by_class_name # 6、find_element_by_name # 7、find_element_by_css_selector # 8、find_element_by_xpath # 强调: # 
1、上述均可以改写成find_element(By.ID,'kw')的形式 # 2、find_elements_by_xxx的形式是查找到多个元素,结果为列表 #===============示范用法=================== # 1、find_element_by_id print('driver.find_element_by_id(kw)',driver.find_element_by_id('kw')) # driver.find_element_by_id(kw) <selenium.webdriver.remote.webelement.WebElement (session="9c700746c88efecdf2433feea3ccd4b3", element="0.8758959721944797-1")> # 2、find_element_by_link_text # login=driver.find_element_by_link_text('登录') # login.click() # 3、find_element_by_partial_link_text login=driver.find_elements_by_partial_link_text('录')[0] login.click() # 4、find_element_by_tag_name print('driver.find_element_by_tag_name(a)',driver.find_element_by_tag_name('a')) # driver.find_element_by_tag_name(a) <selenium.webdriver.remote.webelement.WebElement (session="9c700746c88efecdf2433feea3ccd4b3", element="0.8758959721944797-3")> # 5、find_element_by_class_name button=wait.until(EC.element_to_be_clickable((By.CLASS_NAME,'tang-pass-footerBarULogin'))) # tang-pass-footerBarULogin这个是用户名登录这个p标签的一个class button.click() # 6、find_element_by_name input_user=wait.until(EC.presence_of_element_located((By.NAME,'userName'))) # 这个是input输入框,input的属性name='userName' input_pwd=wait.until(EC.presence_of_element_located((By.NAME,'password'))) # 密码 input输入框 name commit=wait.until(EC.element_to_be_clickable((By.ID,'TANGRAM__PSP_10__submit'))) # 登录按钮id input_user.send_keys('18611453110') input_pwd.send_keys('xxxxxx') commit.click() # 7、find_element_by_css_selector driver.find_element_by_css_selector('#kw') # 8、find_element_by_xpath time.sleep(5) finally: driver.close() """ ''' 二. 
xpath # 官网链接:http://selenium-python.readthedocs.io/locating-elements.html from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By # 按照什么方式查找,By.ID,By.CSS_SELECTOR from selenium.webdriver.common.keys import Keys # 键盘按键操作 from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait # 等待页面加载某些元素 import time driver = webdriver.PhantomJS() # 这里使用的是PhantomJS,我还没安装 driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html') # wait=WebDriverWait(driver,3) driver.implicitly_wait(3) # 使用隐式等待 try: # find_element_by_xpath # //与/ # driver.find_element_by_xpath('//body/a') # 开头的//代表从整篇文档中寻找,body之后的/代表body的儿子,这一行找不到就会报错了 ret1 = driver.find_element_by_xpath('//body//a') # 开头的//代表从整篇文档中寻找,body之后的//代表body的子子孙孙 ret2 = driver.find_element_by_css_selector('body a') print(ret1) print('*'*50) print(ret2) # 取第n个 res1 = driver.find_elements_by_xpath('//body//a[1]') # 取第一个a标签 print(res1[0].text) # 按照属性查找,下述三者查找效果一样 res1 = driver.find_element_by_xpath('//a[5]') res2 = driver.find_element_by_xpath('//a[@href="image5.html"]') res3 = driver.find_element_by_xpath('//a[contains(@href,"image5")]') # 模糊查找 print('==>', res1.text) print('==>', res2.text) print('==>', res3.text) # 其他 res1 = driver.find_element_by_xpath('/html/body/div/a') print(res1.text) res2 = driver.find_element_by_xpath('//a[img/@src="image3_thumb.jpg"]') # 找到子标签img的src属性为image3_thumb.jpg的a标签 print(res2.tag_name, res2.text) res3 = driver.find_element_by_xpath("//input[@name='continue'][@type='button']") # 查看属性name为continue且属性type为button的input标签 res4 = driver.find_element_by_xpath("//*[@name='continue'][@type='button']") # 查看属性name为continue且属性type为button的所有标签 time.sleep(5) finally: driver.close() ''' """ # 获取标签属性 from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR from 
selenium.webdriver.common.keys import Keys #键盘按键操作 from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素 browser=webdriver.Chrome() browser.get('https://www.amazon.cn/') wait=WebDriverWait(browser,10) wait.until(EC.presence_of_element_located((By.ID,'cc-lm-tcgShowImgContainer'))) tag=browser.find_element(By.CSS_SELECTOR,'#cc-lm-tcgShowImgContainer img') #获取标签属性, print(tag.get_attribute('src')) #获取标签ID,位置,名称,大小(了解) print(tag.id) print(tag.location) print(tag.tag_name) print(tag.size) browser.close() """ """ # 等待元素被加载 # #1、selenium只是模拟浏览器的行为,而浏览器解析页面是需要时间的(执行css,js),一些元素可能需要过一段时间才能加载出来,为了保证能查找到元素,必须等待 # #2、等待的方式分两种: # 隐式等待:在browser.get('xxx')前就设置,针对所有元素有效 # 显式等待:在browser.get('xxx')之后设置,只针对某个元素有效 例子: from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR from selenium.webdriver.common.keys import Keys #键盘按键操作 from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素 browser=webdriver.Chrome() #隐式等待:在查找所有元素时,如果尚未被加载,则等10秒 browser.implicitly_wait(10) browser.get('https://www.baidu.com') input_tag=browser.find_element_by_id('kw') input_tag.send_keys('美女') input_tag.send_keys(Keys.ENTER) contents=browser.find_element_by_id('content_left') #没有等待环节而直接查找,找不到则会报错 print(contents) browser.close() from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR from selenium.webdriver.common.keys import Keys #键盘按键操作 from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素 browser=webdriver.Chrome() browser.get('https://www.baidu.com') input_tag=browser.find_element_by_id('kw') input_tag.send_keys('美女') input_tag.send_keys(Keys.ENTER) #显式等待:显式地等待某个元素被加载 
wait=WebDriverWait(browser,10) wait.until(EC.presence_of_element_located((By.ID,'content_left'))) contents=browser.find_element(By.CSS_SELECTOR,'#content_left') print(contents) browser.close() """ """ 六 .元素交互操作 from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR from selenium.webdriver.common.keys import Keys #键盘按键操作 from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素 browser=webdriver.Chrome() browser.get('https://www.amazon.cn/') wait=WebDriverWait(browser,10) input_tag=wait.until(EC.presence_of_element_located((By.ID,'twotabsearchtextbox'))) input_tag.send_keys('iphone 8') button=browser.find_element_by_css_selector('#nav-search > form > div.nav-right > div > input') button.click() import time time.sleep(3) input_tag=browser.find_element_by_id('twotabsearchtextbox') input_tag.clear() #清空输入框 input_tag.send_keys('iphone7plus') button=browser.find_element_by_css_selector('#nav-search > form > div.nav-right > div > input') button.click() browser.close() """ """ from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By # 按照什么方式查找,By.ID,By.CSS_SELECTOR from selenium.webdriver.common.keys import Keys # 键盘按键操作 from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait # 等待页面加载某些元素 import time driver = webdriver.Chrome() driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable') wait=WebDriverWait(driver,3) # driver.implicitly_wait(3) # 使用隐式等待 try: driver.switch_to.frame('iframeResult') ##切换到iframeResult sourse=driver.find_element_by_id('draggable') target=driver.find_element_by_id('droppable') #方式一:基于同一个动作链串行执行 actions=ActionChains(driver) #拿到动作链对象 actions.drag_and_drop(sourse,target) #把动作放到动作链中,准备串行执行 actions.perform() #方式二:不同的动作链,每次移动的位移都不同 ''' 
ActionChains(driver).click_and_hold(sourse).perform() distance=target.location['x']-sourse.location['x'] track=0 while track < distance: ActionChains(driver).move_by_offset(xoffset=2,yoffset=0).perform() track+=2 time.sleep(0.5) ActionChains(driver).release().perform() ''' time.sleep(10) finally: driver.close() """ """ # 在交互动作比较难实现的时候可以自己写JS(万能方法) from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR from selenium.webdriver.common.keys import Keys #键盘按键操作 from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素 try: browser=webdriver.Chrome() browser.get('https://www.baidu.com') browser.execute_script('alert("hello world")') #打印警告 execute_script()这个函数可以执行javascript命令 finally: pass # browser.close() """ """ # 补充:frame的切换 #frame相当于一个单独的网页,在父frame里是无法直接查看到子frame的元素的,必须switch_to_frame切到该frame下,才能进一步查找 # 在本例中访问的网页使用了frame from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR from selenium.webdriver.common.keys import Keys #键盘按键操作 from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素 try: browser=webdriver.Chrome() browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable') browser.switch_to.frame('iframeResult') #切换到id为iframeResult的frame tag1=browser.find_element_by_id('droppable') print(tag1) # tag2=browser.find_element_by_id('textareaCode') #报错,在子frame里无法查看到父frame的元素 browser.switch_to.parent_frame() #切回父frame,就可以查找到了 tag2=browser.find_element_by_id('textareaCode') # 这是那个代码的输入框 print(tag2) finally: browser.close() """
""" #模拟浏览器的前进后退 import time from selenium import webdriver browser=webdriver.Chrome() browser.get('https://www.baidu.com') browser.get('https://www.taobao.com') browser.get('http://www.sina.com.cn/') browser.back() time.sleep(10) browser.forward() browser.close() """ """ #cookies from selenium import webdriver browser=webdriver.Chrome() browser.get('https://www.zhihu.com/explore') print(browser.get_cookies()) browser.add_cookie({'k1':'xxx','k2':'yyy'}) print(browser.get_cookies()) # browser.delete_all_cookies() """ """ #选项卡管理(多个网页窗口的管理):切换选项卡,有js的方式windows.open,有windows快捷键:ctrl+t等,最通用的就是js的方式 import time from selenium import webdriver browser=webdriver.Chrome() browser.get('https://www.baidu.com') browser.execute_script('window.open()') print(browser.window_handles) #获取所有的选项卡 # 横线表示可以运行,但是pyharm不建议使用这种方法,有更好的方法替代 使用 switch_to.window 代替 switch_to_window # 参见博客 https://blog.csdn.net/ccggaag/article/details/76652274 browser.switch_to_window(browser.window_handles[1]) browser.get('https://www.taobao.com') time.sleep(10) browser.switch_to_window(browser.window_handles[0]) browser.get('https://www.sina.com.cn') browser.close() """ """ # 异常处理 from selenium import webdriver from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException try: browser=webdriver.Chrome() browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable') browser.switch_to.frame('iframssseResult') except TimeoutException as e: print(e) except NoSuchFrameException as e: print(e) finally: browser.close() """ """ # 自动登录163邮箱并发送邮件 #注意:网站都策略都是在不断变化的,精髓在于学习流程。下述代码生效与2017-11-7,不能保证永久有效 from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait browser=webdriver.Chrome() try: browser.get('http://mail.163.com/') 
wait=WebDriverWait(browser,5) frame=wait.until(EC.presence_of_element_located((By.ID,'x-URS-iframe'))) browser.switch_to.frame(frame) wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'.m-container'))) inp_user=browser.find_element_by_name('email') inp_pwd=browser.find_element_by_name('password') button=browser.find_element_by_id('dologin') inp_user.send_keys('18611453110') inp_pwd.send_keys('xxxx') button.click() #如果遇到验证码,可以把下面一小段打开注释 # import time # time.sleep(10) # button = browser.find_element_by_id('dologin') # button.click() wait.until(EC.presence_of_element_located((By.ID,'dvNavTop'))) write_msg=browser.find_elements_by_css_selector('#dvNavTop li')[1] #获取第二个li标签就是“写信”了 write_msg.click() wait.until(EC.presence_of_element_located((By.CLASS_NAME,'tH0'))) recv_man=browser.find_element_by_class_name('nui-editableAddr-ipt') title=browser.find_element_by_css_selector('.dG0 .nui-ipt-input') recv_man.send_keys('378533872@qq.com') title.send_keys('圣旨') print(title.tag_name) frame=wait.until(EC.presence_of_element_located((By.CLASS_NAME,'APP-editor-iframe'))) browser.switch_to.frame(frame) body=browser.find_element(By.CSS_SELECTOR,'body') body.send_keys('egon很帅,可以加工资了') browser.switch_to.parent_frame() #切回他爹 send_button=browser.find_element_by_class_name('nui-toolbar-item') send_button.click() #可以睡时间久一点别让浏览器关掉,看看发送成功没有 import time time.sleep(10000) except Exception as e: print(e) finally: browser.close() """ """ # 爬取京东商城商品信息 from selenium import webdriver from selenium.webdriver import ActionChains from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR from selenium.webdriver.common.keys import Keys #键盘按键操作 from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素 import time def get_goods(driver): try: goods=driver.find_elements_by_class_name('gl-item') for good in goods: detail_url=good.find_element_by_tag_name('a').get_attribute('href') 
p_name=good.find_element_by_css_selector('.p-name em').text.replace('\n','') price=good.find_element_by_css_selector('.p-price i').text p_commit=good.find_element_by_css_selector('.p-commit a').text msg = ''' 商品 : %s 链接 : %s 价钱 :%s 评论 :%s ''' % (p_name,detail_url,price,p_commit) print(msg,end='\n\n') button=driver.find_element_by_partial_link_text('下一页') button.click() time.sleep(1) get_goods(driver) except Exception: pass def spider(url,keyword): driver = webdriver.Chrome() driver.get(url) driver.implicitly_wait(3) # 使用隐式等待 try: input_tag=driver.find_element_by_id('key') input_tag.send_keys(keyword) input_tag.send_keys(Keys.ENTER) get_goods(driver) finally: driver.close() if __name__ == '__main__': spider('https://www.jd.com/',keyword='iPhone8手机') """
Beautiful Soup 基本用法
#
# html文本
# html_doc = """ # <html><head><title>The Dormouse's story</title></head> # <body> # <p class="title"><b>The Dormouse's story</b></p> # # <p class="story">Once upon a time there were three little sisters; and their names were # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; # and they lived at the bottom of a well.</p> # # <p class="story">...</p> # """ # # #基本使用:容错处理,文档的容错能力指的是在html代码不完整的情况下,使用该模块可以识别该错误。使用BeautifulSoup解析上述代码,能够得到一个 BeautifulSoup 的对象,并能按照标准的缩进格式的结构输出 # from bs4 import BeautifulSoup # soup=BeautifulSoup(html_doc,'lxml') #具有容错功能 # res=soup.prettify() #处理好缩进,结构化显示 # print(res) #遍历文档树:即直接通过标签名字选择,特点是选择速度快,但如果存在多个相同的标签则只返回第一个 html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <p id="my p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """ # #1、用法 from bs4 import BeautifulSoup # soup=BeautifulSoup(html_doc,'lxml') # # soup=BeautifulSoup(open('a.html'),'lxml') # # print(soup.p) #存在多个相同的标签则只返回第一个 <p class="title" id="my p"><b class="boldest" id="bbb">The Dormouse's story</b></p> # print(soup.a) #存在多个相同的标签则只返回第一个 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> # # #2、获取标签的名称 # print(soup.p.name) # p # # #3、获取标签的属性 # print(soup.p.attrs) #{'id': 'my p', 'class': ['title']} # # #4、获取标签的内容 # print(soup.p.string,111) # p下的文本只有一个时,取到,否则为None # print(soup.p.strings,222) #拿到一个生成器对象, 取到p下所有的文本内容 # print(soup.p.text,333) #取到p下所有的文本内容 # for line in 
soup.stripped_strings: #去掉空白 # print(line,444) ''' The Dormouse's story 111 <generator object _all_strings at 0x00000186C64B5F68> 222 The Dormouse's story 333 The Dormouse's story 444 The Dormouse's story 444 Once upon a time there were three little sisters; and their names were 444 Elsie 444 , 444 Lacie 444 and 444 Tillie 444 ; and they lived at the bottom of a well. 444 ... 444 ''' # ''' # 如果tag包含了多个子节点,tag就无法确定 .string 方法应该调用哪个子节点的内容, .string 的输出结果是 None,\ # 如果只有一个子节点那么就输出该子节点的文本,比如下面的这种结构,soup.p.string 返回为None,但soup.p.strings就可以找到所有文本 html_doc2 = '''<p id='list-1'> 哈哈哈哈 <a class='sss'> <span> <h1>aaaa</h1> </span> </a> <b>bbbbb</b> </p> ''' # soup=BeautifulSoup(html_doc2,'lxml') # print(soup.p.string) # print(soup.p.strings) # for line in soup.p.strings: # print(line) # # #5、嵌套选择 # print(soup.head.title.string) # print(soup.body.a.string) # # # #6、子节点、子孙节点 # print(soup.p.contents) #p下所有子节点 # print(soup.p.children) #得到一个迭代器,包含p下所有子节点 # # for i,child in enumerate(soup.p.children): # print(i,child) # # print(soup.p.descendants) #获取子孙节点,p下所有的标签都会选择出来 # for i,child in enumerate(soup.p.descendants): # print(i,child) # # #7、父节点、祖先节点 # print(soup.a.parent) #获取a标签的父节点 # print(soup.a.parents) #找到a标签所有的祖先节点,父亲的父亲,父亲的父亲的父亲... 
# # # #8、兄弟节点 # print('=====>') # print(soup.a.next_sibling) #下一个兄弟 # print(soup.a.previous_sibling) #上一个兄弟 # # print(list(soup.a.next_siblings)) #下面的兄弟们=>生成器对象 # print(soup.a.previous_siblings) #上面的兄弟们=>生成器对象 ---------------------------------------------------------------------------------------------------------------------------------------------------- #搜索文档树:BeautifulSoup定义了很多搜索方法,这里着重介绍2个: find() 和 find_all() .其它方法的参数和用法类似 html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <p id="my p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b> </p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """ ''' from bs4 import BeautifulSoup soup=BeautifulSoup(html_doc,'lxml') #1、五种过滤器: 字符串、正则表达式、列表、True、方法 #1.1、字符串:即标签名 print(soup.find_all('b')) #1.2、正则表达式 import re print(soup.find_all(re.compile('^b'))) #找出b开头的标签,结果有body和b标签 #1.3、列表:如果传入列表参数,Beautiful Soup会将与列表中任一元素匹配的内容返回.下面代码找到文档中所有<a>标签和<b>标签: print(soup.find_all(['a','b'])) #1.4、True:可以匹配任何值,下面代码查找到所有的tag,但是不会返回字符串节点 print(soup.find_all(True)) for tag in soup.find_all(True): print(tag.name) #1.5、方法:如果没有合适过滤器,那么还可以定义一个方法,方法只接受一个元素参数 ,如果这个方法返回 True 表示当前元素匹配并且被找到,如果不是则反回 False def has_class_but_no_id(tag): return tag.has_attr('class') and not tag.has_attr('id') print(soup.find_all(has_class_but_no_id)) ''' ---------------------------------------------------------------------------------------------------------------------------------------------- ''' find_all() 的使用 from bs4 import BeautifulSoup import re soup=BeautifulSoup(html_doc,'lxml') #2、find_all( name , attrs , recursive , text , **kwargs ) #2.1、name: 搜索name参数的值可以使任一类型的 过滤器 ,字符窜,正则表达式,列表,方法或是 True . 
print(soup.find_all(name=re.compile('^t'))) #2.2、keyword: key=value的形式,value可以是过滤器:字符串 , 正则表达式 , 列表, True . print(soup.find_all(id=re.compile('my'))) print(soup.find_all(href=re.compile('lacie'),id=re.compile('\d'))) #注意类要用class_ print(soup.find_all(id=True)) #查找有id属性的标签 # 有些tag属性在搜索不能使用,比如HTML5中的 data-* 属性: data_soup = BeautifulSoup('<div data-foo="value">foo!</div>','lxml') # data_soup.find_all(data-foo="value") #报错:SyntaxError: keyword can't be an expression # 但是可以通过 find_all() 方法的 attrs 参数定义一个字典参数来搜索包含特殊属性的tag: print(data_soup.find_all(attrs={"data-foo": "value"})) # [<div data-foo="value">foo!</div>] #2.3、按照类名查找,注意关键字是class_,class_=value,value可以是五种选择器之一 print(soup.find_all('a',class_='sister')) #查找类为sister的a标签 print(soup.find_all('a',class_='sister ssss')) #查找类为sister和sss的a标签,顺序错误也匹配不成功 print(soup.find_all(class_=re.compile('^sis'))) #查找类为sister的所有标签 #2.4、attrs print(soup.find_all('p',attrs={'class':'story'})) #2.5、text: 值可以是:字符,列表,True,正则 print(soup.find_all(text='Elsie')) print(soup.find_all('a',text='Elsie')) #2.6、limit参数:如果文档树很大那么搜索会很慢.如果我们不需要全部结果,可以使用 limit 参数限制返回结果的数量.效果与SQL中的limit关键字类似,当搜索到的结果数量达到 limit 的限制时,就停止搜索返回结果 print(soup.find_all('a',limit=2)) #2.7、recursive:调用tag的 find_all() 方法时,Beautiful Soup会检索当前tag的所有子孙节点,如果只想搜索tag的直接子节点,可以使用参数 recursive=False . print(soup.html.find_all('a')) print(soup.html.find_all('a',recursive=False)) ''' ''' 像调用 find_all() 一样调用tag find_all() 几乎是Beautiful Soup中最常用的搜索方法,所以我们定义了它的简写方法. 
BeautifulSoup 对象和 tag 对象可以被当作一个方法来使用,这个方法的执行结果与调用这个对象的 find_all() 方法相同,下面两行代码是等价的: soup.find_all("a") soup("a") 这两行代码也是等价的: soup.title.find_all(text=True) soup.title(text=True) ''' ''' find() 方法 #3、find( name , attrs , recursive , text , **kwargs ) find_all() 方法将返回文档中符合条件的所有tag,尽管有时候我们只想得到一个结果.比如文档中只有一个<body>标签,那么使用 find_all() 方法来查找<body>标签就不太合适, 使用 find_all 方法并设置 limit=1 参数不如直接使用 find() 方法.下面两行代码是等价的: soup.find_all('title', limit=1) # [<title>The Dormouse's story</title>] soup.find('title') # <title>The Dormouse's story</title> 唯一的区别是 find_all() 方法的返回结果是值包含一个元素的列表,而 find() 方法直接返回结果. find_all() 方法没有找到目标是返回空列表, find() 方法找不到目标时,返回 None . print(soup.find("nosuchtag")) # None soup.head.title 是 tag的名字 方法的简写.这个简写的原理就是多次调用当前tag的 find() 方法:链式编程? soup.head.title # <title>The Dormouse's story</title> soup.find("head").find("title") # <title>The Dormouse's story</title> ''' # beautiful soup官网 # https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html#find-parents-find-parent #该模块提供了select方法来支持css,详见官网:https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html#id37 html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <p class="title"> <b>The Dormouse's story</b> Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"> <span>Elsie</span> </a> <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; <div class='panel-1'> <ul class='list' id='list-1'> <li class='element'>Foo</li> <li class='element'>Bar</li> <li class='element'>Jay</li> </ul> <ul class='list list-small' id='list-2'> <li class='element'><h1 class='yyyy'>Foo</h1></li> <li class='element xxx'>Bar</li> <li class='element'>Jay</li> </ul> </div> and they lived at the bottom of a well. 
</p> <p class="story">...</p> """ ''' from bs4 import BeautifulSoup soup=BeautifulSoup(html_doc,'lxml') #1、CSS选择器 print('.sister',soup.p.select('.sister')) print('.sister span',soup.select('.sister span')) print('#link1',soup.select('#link1')) print('#link1 span',soup.select('#link1 span')) print('#list-2 .element.xxx',soup.select('#list-2 .element.xxx')) print('#list-2','.element',soup.select('#list-2')[0].select('.element')) #可以一直select,但其实没必要,一条select就可以了 # 2、获取属性 print('#list-2 h1',soup.select('#list-2 h1')[0].attrs) # 3、获取内容 print('#list-2 h1',soup.select('#list-2 h1')[0].get_text()) '''
爬虫使用并发提高效率
1.同步调用,不使用并发
# 同步调用 (synchronous calls): each URL is fetched and parsed in turn, so
# every task blocks the ones after it.


def parse_page(res):
    """Parse one downloaded page; returns its length (0 when the download failed)."""
    # get_page() returns None on a non-200 response; len(None) would raise
    # TypeError, so treat a failed download as an empty page.
    if res is None:
        res = ''
    print('解析 %s' % (len(res)))
    return len(res)


def get_page(url):
    """Download *url* and return its text, or None on a non-200 response."""
    # Imported lazily so the helpers above stay importable (and testable)
    # without the third-party dependency installed.
    import requests
    print('下载 %s' % url)
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return None  # explicit: nothing useful to parse


if __name__ == '__main__':
    urls = ['https://www.baidu.com/', 'http://www.sina.com.cn/', 'https://www.python.org']
    for url in urls:
        # Calling a task blocks right here until it finishes and hands back
        # its result; only then does the next one start.
        res = get_page(url)
        parse_page(res)
2.使用多进程(线程)
# IO-bound programs should use threads: downloads spend their time waiting on
# the network, so threads let the waits overlap.
# (multiprocessing would work too, but threads are cheaper for IO-bound work)
from threading import Thread, current_thread


def parse_page(res):
    """Callback: report which thread parsed the page; returns the page length."""
    # current_thread().name replaces getName(), deprecated since Python 3.10.
    print('%s 解析 %s' % (current_thread().name, len(res)))
    return len(res)


def get_page(url, callback=parse_page):
    """Download *url* and pass the body to *callback* on a 200 response."""
    # Imported lazily so parse_page stays importable without the third-party dependency.
    import requests
    print('%s 下载 %s' % (current_thread().name, url))
    response = requests.get(url)
    if response.status_code == 200:
        callback(response.text)


if __name__ == '__main__':
    urls = ['https://www.baidu.com/', 'http://www.sina.com.cn/', 'https://www.python.org']
    for url in urls:
        t = Thread(target=get_page, args=(url,))
        t.start()

# Sample output (thread interleaving varies between runs):
# Thread-1 下载 https://www.baidu.com/
# Thread-2 下载 http://www.sina.com.cn/
# Thread-3 下载 https://www.python.org
# Thread-1 解析 2443
# Thread-2 解析 570061
# Thread-3 解析 48823
3.使用进程池(线程池)
# IO-bound work suits threads, so here we use a thread pool.
# Pool pattern: asynchronous submission + completion callback.
import requests
from threading import current_thread
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor


def parse_page(res):
    """Completion callback: *res* is a Future, so the text is read via .result()."""
    text = res.result()  # the callback receives the Future, not the return value itself
    print('%s 解析 %s' % (current_thread().getName(), len(text)))


def get_page(url):
    """Download *url*; the returned text becomes the Future's result."""
    print('%s 下载 %s' % (current_thread().getName(), url))
    resp = requests.get(url)
    if resp.status_code == 200:
        return resp.text


if __name__ == '__main__':
    pool = ThreadPoolExecutor(50)
    # pool = ProcessPoolExecutor(50)
    for target in ('https://www.baidu.com/', 'http://www.sina.com.cn/', 'https://www.python.org'):
        pool.submit(get_page, target).add_done_callback(parse_page)
    pool.shutdown(wait=True)  # block until every submitted task has finished
4.使用asyncio模块,可以帮我们检测IO(只能是网络IO),实现应用程序级别的切换
# Basic asyncio usage: the event loop detects (network) IO waits and switches
# between coroutines at every await point.
# The original used the pre-3.5 @asyncio.coroutine / `yield from` style, which
# was removed in Python 3.11; async/await is the direct modern equivalent.
import asyncio


async def task(task_id, seconds):
    """Toy coroutine: announce start, pretend to do IO for *seconds*, announce end."""
    print('%s is start' % task_id)
    # asyncio.sleep stands in for network IO; awaiting it hands control back
    # to the event loop so the other tasks can run meanwhile.
    await asyncio.sleep(seconds)
    print('%s is end' % task_id)
    return task_id


async def _main():
    # gather() starts the coroutines in submission order and collects results.
    await asyncio.gather(
        task(task_id="任务1", seconds=3),
        task("任务2", 2),
        task(task_id="任务3", seconds=1),
    )


if __name__ == '__main__':
    # asyncio.run() replaces the get_event_loop()/run_until_complete()/close() dance.
    asyncio.run(_main())

# Why did the original always start the *middle* task first?  asyncio.wait()
# wraps the passed coroutines in a set, so their start order was arbitrary;
# gather() preserves submission order instead.
# 任务1 is start
# 任务2 is start
# 任务3 is start
# 任务3 is end   (shortest sleep finishes first)
# 任务2 is end
# 任务1 is end
5.asyncio模块只能发tcp级别的请求,不能发http协议,因此,在我们需要发送http请求的时候,需要我们自定义http报头
# asyncio only speaks raw TCP, not HTTP — so to send an HTTP request we must
# assemble the HTTP headers ourselves.
# (The original used the pre-3.5 @asyncio.coroutine / `yield from` style —
# removed in Python 3.11 — plus a long aside explaining that
# `yield from iterable` is shorthand for `for item in iterable: yield item`;
# async/await below is the direct modern equivalent.)
import asyncio
import requests  # NOTE(review): unused in this snippet; kept because sibling sections of this file also rely on it
import uuid

user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'


def parse_page(host, res):
    """Callback: report the body size and dump it to a uniquely named file."""
    print('%s 解析结果 %s' % (host, len(res)))
    # uuid1() derives a globally unique name from MAC address + timestamp + random bits.
    with open('%s.html' % (uuid.uuid1()), 'wb') as f:
        f.write(res)


async def get_page(host, port=80, url='/', callback=parse_page, ssl=False):
    """Fetch http(s)://host:port + url over a raw TCP connection and feed the body to *callback*."""
    print('下载 http://%s:%s%s' % (host, port, url))

    # Step 1 (blocking IO): open the TCP connection; awaiting yields to the loop.
    if ssl:
        port = 443
    # BUGFIX: the original passed the literal 443 here, so even ssl=False
    # requests meant for port 80 were sent to 443; use the computed port.
    recv, send = await asyncio.open_connection(host=host, port=port, ssl=ssl)

    # Step 2: build the HTTP request by hand — asyncio only ships the TCP layer.
    request_headers = """GET %s HTTP/1.0\r\nHost: %s\r\nUser-agent: %s\r\n\r\n""" % (url, host, user_agent)
    # request_headers = """POST %s HTTP/1.0\r\nHost: %s\r\n\r\nname=egon&password=123""" % (url, host)
    request_headers = request_headers.encode('utf-8')

    # Step 3 (blocking IO): send the request.
    send.write(request_headers)
    await send.drain()

    # Step 4 (blocking IO): read response headers line by line up to the blank line.
    while True:
        line = await recv.readline()
        if line == b'\r\n':
            break
        print('%s Response headers:%s' % (host, line))

    # Step 5 (blocking IO): read the response body.
    text = await recv.read()

    # Step 6: hand the body to the callback.
    callback(host, text)

    # Step 7: close our side of the socket.  TCP teardown is a four-way
    # handshake, so there is no recv.close(): the peer closes its side once it
    # sees ours go down.
    send.close()


async def _main():
    # asyncio.wait() no longer accepts bare coroutines (removed in 3.11);
    # gather() schedules them directly.
    await asyncio.gather(
        get_page('www.baidu.com', url='/s?wd=美女', ssl=True),
        get_page('www.cnblogs.com', url='/', ssl=True),
    )


if __name__ == '__main__':
    asyncio.run(_main())
6.使用aiohttp模块,专门帮我们封装http报头,然后我们还需要用asyncio检测IO实现切换
# asyncio + aiohttp: aiohttp builds the HTTP layer on top of asyncio.
# The original snippet failed ("这段代码报错了") because the old
# `yield from aiohttp.request('GET', url)` API no longer works: in current
# aiohttp, requests must be made through a ClientSession used as an async
# context manager.
import aiohttp
import asyncio


async def get_page(url):
    """Download *url* with aiohttp, print the raw body; returns 1 on success."""
    print('GET:%s' % url)
    # The ClientSession owns the connection pool; `async with` guarantees the
    # session and the response are closed cleanly (the old snippet leaked the
    # response, which is what raised on newer aiohttp versions).
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            data = await response.read()
    print(url, data)
    return 1


async def _main():
    # gather() returns the coroutines' results in submission order.
    return await asyncio.gather(
        get_page('https://www.python.org/doc'),
        get_page('https://www.cnblogs.com/linhaifeng'),
        get_page('https://www.openstack.org'),
    )


if __name__ == '__main__':
    results = asyncio.run(_main())
    print('=====>', results)  # [1, 1, 1]
7.在上面的基础上将requests.get函数传给asyncio,就能够被检测了
# asyncio + requests: requests.get() is blocking, but run_in_executor() runs
# it in a thread pool, so the event loop can still switch between tasks while
# the download is in flight.
import asyncio


async def get_page(func, *args):  # e.g. get_page(requests.get, 'https://www.python.org/doc')
    """Run the blocking call func(*args) in the default executor and await its result."""
    print('GET:%s' % args[0])
    # BUGFIX: the original bound the loop to a misspelled name (`loog`) and
    # then read `loop`, which only worked by accident through the module-level
    # variable of the same name.  get_running_loop() is the explicit, correct way.
    loop = asyncio.get_running_loop()
    future = loop.run_in_executor(None, func, *args)  # None → default ThreadPoolExecutor
    response = await future
    print(response.url, len(response.text))
    return 1


if __name__ == '__main__':
    import requests  # only the demo driver needs the third-party dependency

    async def _main():
        return await asyncio.gather(
            get_page(requests.get, 'https://www.python.org/doc'),
            get_page(requests.get, 'https://www.cnblogs.com/linhaifeng'),
            get_page(requests.get, 'https://www.openstack.org'),
        )

    results = asyncio.run(_main())
    print('=====>', results)  # [1, 1, 1]
8.gevent+requests使用协程
# gevent + requests: monkey-patching makes the blocking socket calls inside
# requests cooperative, so greenlets switch on network IO.
from gevent import monkey;monkey.patch_all()  # must run before requests is imported
import gevent
import requests


def get_page(url):
    """Download *url*, report its size, and return 1 as the greenlet's value."""
    print('GET:%s' % url)
    response = requests.get(url)
    print(url, len(response.text))
    return 1


# Plain greenlets (no pool) would look like this:
# g1=gevent.spawn(get_page,'https://www.python.org/doc')
# g2=gevent.spawn(get_page,'https://www.cnblogs.com/linhaifeng')
# g3=gevent.spawn(get_page,'https://www.openstack.org')
# gevent.joinall([g1,g2,g3,])
# print(g1.value,g2.value,g3.value)  # collect the return values

# Greenlet pool: at most 2 downloads in flight at any moment.
from gevent.pool import Pool

pool = Pool(2)
targets = (
    'https://www.python.org/doc',
    'https://www.cnblogs.com/linhaifeng',
    'https://www.openstack.org',
)
workers = [pool.spawn(get_page, target) for target in targets]
gevent.joinall(workers)
print(*(w.value for w in workers))  # each greenlet's return value
9.使用封装了gevent+requests模块的grequests模块
# grequests bundles gevent + requests: build the request objects first, then
# map() sends them all concurrently.
import grequests

# The first URL is intentionally unreachable so the failure path is exercised.
request_list = [
    grequests.get(target)
    for target in (
        'https://wwww.xxxx.org/doc1',
        'https://www.cnblogs.com/linhaifeng',
        'https://www.openstack.org',
    )
]

##### Run and collect the response list #####
# response_list = grequests.map(request_list)
# print(response_list)


##### Run and collect the response list (with failure handling) #####
def exception_handler(request, exception):
    """Invoked by grequests.map() for every request that raised; logs the URL."""
    # print(request,exception)
    print("%s Request failed" % request.url)


response_list = grequests.map(request_list, exception_handler=exception_handler)
print(response_list, 111)
# https://wwww.xxxx.org/doc1 Request failed
# [None, <Response [200]>, <Response [200]>] 111  — the list holds response objects; failed entries map to None
10.Twisted:是一个异步网络框架,其中一个功能是发送异步请求,检测IO并自动切换
# Windows installation notes for twisted (kept verbatim from the author):
'''
#问题一:error: Microsoft Visual C++ 14.0 is required. Get it with "Microsoft Visual C++ Build Tools": http://landinghub.visualstudio.com/visual-cpp-build-tools
https://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
pip3 install C:\Users\Administrator\Downloads\Twisted-17.9.0-cp36-cp36m-win_amd64.whl
pip3 install twisted
#问题二:ModuleNotFoundError: No module named 'win32api'
https://sourceforge.net/projects/pywin32/files/pywin32/
#问题三:openssl
pip3 install pyopenssl
'''
# Basic twisted usage: getPage fires the request and returns a Deferred;
# callbacks run when the response arrives, and reactor.run() drives the loop.
# NOTE(review): getPage is a legacy API (removed from modern Twisted in favour
# of twisted.web.client.Agent) — confirm against the installed version.
from twisted.web.client import getPage,defer
from twisted.internet import reactor


def all_done(arg):
    # Fired once every Deferred in the list has resolved; stops the event loop.
    # print(arg)
    reactor.stop()


def callback(res):
    # res is the raw response body.
    print(res)
    return 1


defer_list=[]
urls=[
    'http://www.baidu.com',
    'http://www.bing.com',
    'https://www.python.org',
]
for url in urls:
    # NOTE(review): 'utf=-8' looks like a typo for 'utf-8' — encode() would
    # reject an unknown codec name; confirm this ever ran as written.
    obj=getPage(url.encode('utf=-8'),)
    obj.addCallback(callback)
    defer_list.append(obj)
# DeferredList resolves when all child Deferreds have; addBoth runs on success or failure.
defer.DeferredList(defer_list).addBoth(all_done)
reactor.run()

# Detailed usage of twisted's getPage: a POST with form-encoded data, custom
# headers and cookies.
# NOTE(review): a twisted reactor cannot be restarted after reactor.stop(), so
# this second half cannot run in the same process as the first — these are two
# independent example scripts pasted into one section.
from twisted.internet import reactor
from twisted.web.client import getPage
import urllib.parse


def one_done(arg):
    print(arg)
    reactor.stop()


post_data = urllib.parse.urlencode({'check_data': 'adf'})
post_data = bytes(post_data, encoding='utf8')
headers = {b'Content-Type': b'application/x-www-form-urlencoded'}
response = getPage(bytes('http://dig.chouti.com/login', encoding='utf8'),
                   method=bytes('POST', encoding='utf8'),
                   postdata=post_data,
                   cookies={},
                   headers=headers)
response.addBoth(one_done)
reactor.run()
11.tornado
# Fan out requests with tornado's AsyncHTTPClient.
# NOTE(review): the fetch(request, callback) form shown here is from older
# Tornado releases; confirm it against the installed Tornado version.
from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop


def handle_response(response):
    """
    Handle the response body (to stop the IO loop you must maintain a counter
    and call ioloop.IOLoop.current().stop())
    :param response:
    :return:
    """
    if response.error:
        print("Error:", response.error)
    else:
        print(response.body)


def func():
    # Scheduled on the IO loop; fires one async fetch per URL.
    url_list = [
        'http://www.baidu.com',
        'http://www.bing.com',
    ]
    for url in url_list:
        print(url)
        http_client = AsyncHTTPClient()
        http_client.fetch(HTTPRequest(url), handle_response)


ioloop.IOLoop.current().add_callback(func)
ioloop.IOLoop.current().start()

# The example above never exits even after every request finishes; to fix
# that, add a counter and stop the loop once all callbacks have run.
from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop

# In-flight request counter: incremented per fetch, decremented per callback.
count = 0


def handle_response(response):
    """
    Handle the response body (maintains the counter and calls
    ioloop.IOLoop.current().stop() once it reaches zero)
    :param response:
    :return:
    """
    if response.error:
        print("Error:", response.error)
    else:
        print(len(response.body))
    global count
    count -= 1  # one callback has completed
    if count == 0:
        ioloop.IOLoop.current().stop()


def func():
    # Fires one async fetch per URL, counting each request as it goes out.
    url_list = [
        'http://www.baidu.com',
        'http://www.bing.com',
    ]
    global count
    for url in url_list:
        print(url)
        http_client = AsyncHTTPClient()
        http_client.fetch(HTTPRequest(url), handle_response)
        count += 1  # one more request in flight


ioloop.IOLoop.current().add_callback(func)
ioloop.IOLoop.current().start()