day5

最新推荐文章于 2024-11-02 11:13:42 发布

zzm_

最新推荐文章于 2024-11-02 11:13:42 发布

阅读量218

点赞数

文章标签：测试爬虫 python

原文链接：http://www.cnblogs.com/xm123456/p/11042358.html

版权

上周作业

 1 ''''''
 2 '''
 3 爬取豆瓣TOP250电影信息
 4 
 5 主页:
 6     第一页:
 7         https://movie.douban.com/top250?start=0&filter=
 8     第二页:
 9         https://movie.douban.com/top250?start=25&filter=
10     第三页:
11         https://movie.douban.com/top250?start=50&filter=
12     第四页:
13         https://movie.douban.com/top250?start=75&filter=
14     第十页:
15         https://movie.douban.com/top250?start=225&filter=
16         
17     GET
18     User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36
19 
20 re正则:
21     # 电影详情页url、图片链接、电影名称、导演、主演、电影上映时间、电影评分、评价人数、简介
22    <div class="item">.*?href="(.*?)">.*?src="(.*?)" class="">.*?<span class="title">(.*?)</span>.*?<div class="bd">.*?导演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?<span class="inq">(.*?)</span>
23 '''
24 import requests
25 import re
26 
27 
28 headers = {
29     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
30 }
31 base_url = 'https://movie.douban.com/top250?start={}&filter='
32 
33 n = 0
34 for line in range(10):
35     url = base_url.format(n)
36     print(type(n))
37     n += 25
38     print(url)
39 
40     # 1、往豆瓣TOP250发送请求获取响应数据
41     response = requests.get(url, headers=headers)
42 
43     # print(response.text)
44 
45     # 2、通过正则解析提取数据
46     # 电影详情页url、图片链接、电影名称、电影评分、评价人数
47     movie_content_list = re.findall(
48         # 正则规则
49         # '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',
50         '<div class="item">.*?href="(.*?)">.*?src="(.*?)" class="">.*?<span class="title">(.*?)</span>.*?<div class="bd">.*?导演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?<span class="inq">(.*?)</span>',
51 
52         # 解析文本
53         response.text,
54 
55         # 匹配模式
56         re.S)
57 
58     for movie_content in movie_content_list:
59         # 解压赋值每一部电影
60         detail_url, movie_jpg, name, daoyan, timer,point, num,  desc= movie_content
61         data = f'电影名称:{name},   详情页url:{detail_url}, 图片url:{movie_jpg}, 导演: {daoyan} 上映时间: {timer}评分: {point}, 评价人数: {num} 简介:{desc}\n'
62         print(data)
63 
64         # 3、保存数据，把电影信息写入文件中
65         with open('douban.txt', 'a', encoding='utf-8') as f:
66             f.write(data)

requests之post请求

 1 ''''''
 2 '''
 3 post请求登陆github
 4 '''
 5 import requests
 6 import re
 7 
 8 # 一 访问login页获取token信息
 9 '''
10 请求url:
11     https://github.com/login
12 请求方式:   
13     GET
14 响应头:
15     Set-Cookie
16 请求头:
17     Cookie
18     User-Agent
19 '''
20 headers = {
21     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
22 }
23 
24 response = requests.get(url='https://github.com/login', headers=headers)
25 # print(response.text)
26 # 把login页返回的cookies信息转换成字典
27 login_cookies = response.cookies.get_dict()
28 
29 authenticity_token = re.findall('<input type="hidden" name="authenticity_token" value="(.*?)" />', response.text, re.S)[0]
30 
31 print(authenticity_token)
32 
33 
34 
35 # 二 往sessionurl发送POST请求
36 '''
37 
38 请求url:
39     https://github.com/session
40     
41 请求方式:
42     POST
43     
44 请求头:
45     # 上一次请求从哪里来
46     Referer: https://github.com/login
47     Cookie:...
48     User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36
49     
50 请求体:
51     只有POST请求才会有请求体。
52     commit: Sign in
53     utf8: ✓
54     authenticity_token: 
55     VX79esFc0YPdR1UFzUM/6MTRZOlYQ0btF5k2/x7uZea0x2E6W4bmRpwHsaCBN+096PaWNkcQjJOsyUzUqsAhIw==
56     LLWlTr0qLcYC74hn7OI7IlyeB9rZei9737Lqtzz0sKTgY7Js7pUUhZ6bNC6lCkS+OHfVukkbTejjd0BnjPvGUg==
57     login: tankjam1
58     password: *****
59     webauthn-support: unsupported
60 '''
61 # 拼接请求头信息
62 headers2 = {
63     'Referer': 'https://github.com/login',
64     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
65 }
66 
67 # 拼接请求体信息
68 form_data = {
69     "commit": "Sign in",
70     "utf8": "✓",
71     "authenticity_token": authenticity_token,
72     "login": "tankjam",
73     "password": "kermit46709394",
74     "webauthn-support": "unsupported",
75 }
76 
77 # 往session地址发送post请求
78 # 携带请求头、请求体、login页的cookies信息
79 response2 = requests.post(url='https://github.com/session', data=form_data, headers=headers2, cookies=login_cookies)
80 print(response2.status_code)
81 # print(response2.text)
82 with open('github.html', 'w', encoding='utf-8') as f:
83     f.write(response2.text)



requests响应

 1 # import requests
 2 #
 3 # response = requests.get('https://baidu.com')
 4 # # response响应
 5 # print(response.status_code)  # 获取响应状态码
 6 # print(response.url)  # 获取url地址
 7 # print(response.encoding)  # 字符编码
 8 # response.encoding = 'utf-8'
 9 # print(response.text)  # 获取文本
10 # print(response.content)  # 获取二进制流
11 # print(response.headers)  # 获取响应头信息
12 # print(response.history)  # 重定向历史(由跳转过程中的Response对象组成的列表)
13 # # 1、返回cookie字典 2、返回cookies对象
14 # print(response.cookies)  # 获取cookies信息,
15 # print(response.cookies.get_dict())  # 获取cookies信息转换成字典
16 # print(response.cookies.items())  # 获取cookies信息转换成(name, value)元组的列表
17 # print(response.encoding)
18 # print(response.elapsed)  # 访问时间
19 
20 # import requests
21 # # 往视频地址发送get请求
22 # url = 'https://vd3.bdstatic.com/mda-ic4pfhh3ex32svqi/hd/mda-ic4pfhh3ex32svqi.mp4?auth_key=1557973824-0-0-bfb2e69bb5198ff65e18065d91b2b8c8&bcevod_channel=searchbox_feed&pd=wisenatural&abtest=all.mp4'
23 # response = requests.get(url, stream=True)  # stream=True 不会立即下载响应体,可以通过iter_content()分块读取
24 # print(response.content)
25 #
26 # with open('love_for_GD.mp4', 'wb') as f:
27 #     for content in response.iter_content():
28 #         f.write(content)




requests高级用法

  1 '''
  2 '''
  3 '''
  4 证书验证(大部分网站都是https)
  5 '''
  6 import requests
  7 # # 如果是ssl请求,首先检查证书是否合法,不合法则报错,程序中断
  8 # response = requests.get('https://www.xiaohuar.com')
  9 # print(response.status_code)
 10 
 11 # 改进1:去掉报错,但是会报警告
 12 # import requests
 13 # response = requests.get('https://www.xiaohuar.com', verify=False)
 14 # # 不验证证书,报警告,返回200
 15 # print(response.status_code)
 16 
 17 # 改进2:去掉报错,并且去掉警报信息
 18 # import requests
 19 # import urllib3
 20 # urllib3.disable_warnings()  # 关闭警告
 21 # response = requests.get('https://www.xiaohuar.com', verify=False)
 22 # print(response.status_code)
 23 
 24 # 改进3:加上证书
 25 # 很多网站都是https,但是不用证书也可以访问,大多数情况都是可以携带也可以不携带证书
 26 # 知乎\百度等都是可带可不带
 27 # 有硬性要求的,则必须带，比如对于定向的用户,拿到证书后才有权限访问某个特定网站
 28 # import requests
 29 # import urllib3
 30 # # urllib3.disable_warnings()  # 关闭警告
 31 # # 伪代码
 32 # response = requests.get(
 33 #     'https://www.xiaohuar.com',
 34 #     # verify=False,
 35 #     # /path/server.crt证书的存放目录， /path/key
 36 #     cert=('/path/server.crt', '/path/key'))
 37 # print(response.status_code)
 38 
 39 
 40 '''
 41 超时设置
 42 '''
 43 
 44 # 超时设置
 45 # 两种超时:float or tuple
 46 # timeout=0.1  # 单个数值同时作为链接超时和接收数据的超时时间
 47 # timeout=(0.1,0.2)  # 0.1代表链接超时  0.2代表接收数据的超时时间
 48 
 49 # import requests
 50 # response = requests.get('https://www.baidu.com',
 51 #                         timeout=0.0001)
 52 # # print(response.elapsed)
 53 # print(response.status_code)
 54 
 55 '''
 56 代理设置:先发送请求给代理,然后由代理帮忙发送(封ip是常见的事情)
 57 '''
 58 # import requests
 59 # proxies={
 60 #     # 带用户名密码的代理,@符号前是用户名与密码
 61 #     'http':'http://tank:123@localhost:9527',
 62 #     'http':'http://localhost:9527',
 63 #     'https':'https://localhost:9527',
 64 # }
 65 # response=requests.get('https://www.12306.cn',
 66 #                      proxies=proxies)
 67 #
 68 # print(response.status_code)
 69 '''
 70 爬取西刺免费代理：
 71     1.访问西刺免费代理页面
 72     2.通过re模块解析并提取所有代理
 73     3.通过ip测试网站对爬取的代理进行测试
 74     4.若test_ip函数抛出异常代表代理作废，否则代理有效
 75     5.利用有效的代理进行代理测试
 76 
 77 <tr class="odd">
 78       <td class="country"><img src="//fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>
 79       <td>112.85.131.99</td>
 80       <td>9999</td>
 81       <td>
 82         <a href="/2019-05-09/jiangsu">江苏南通</a>
 83       </td>
 84       <td class="country">高匿</td>
 85       <td>HTTPS</td>
 86       <td class="country">
 87         <div title="0.144秒" class="bar">
 88           <div class="bar_inner fast" style="width:88%">
 89 
 90           </div>
 91         </div>
 92       </td>
 93       <td class="country">
 94         <div title="0.028秒" class="bar">
 95           <div class="bar_inner fast" style="width:97%">
 96 
 97           </div>
 98         </div>
 99       </td>
100 
101       <td>6天</td>
102       <td>19-05-16 11:20</td>
103     </tr>
104 re:
105     <tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>
106 
107 '''
108 # import requests
109 # import re
110 # import time
111 #
112 # HEADERS = {
113 #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
114 # }
115 #
116 #
117 # def get_index(url):
118 #     time.sleep(1)
119 #     response = requests.get(url, headers=HEADERS)
120 #     return response
121 #
122 #
123 # def parse_index(text):
124 #     ip_list = re.findall('<tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>', text, re.S)
125 #     for ip_port in ip_list:
126 #         ip = ':'.join(ip_port)
127 #         yield ip
128 #
129 # def test_ip(ip):
130 #     print('测试ip: %s' % ip)
131 #     try:
132 #         proxies = {
133 #             'https': ip
134 #         }
135 #
136 #         # ip测试网站
137 #         ip_url = 'https://www.ipip.net/'
138 #
139 #         # 使用有效与无效的代理对ip测试站点进行访问，若返回的结果为200则代表当前测试ip正常
140 #         response = requests.get(ip_url, headers=HEADERS, proxies=proxies, timeout=1)
141 #
142 #         if response.status_code == 200:
143 #             print(f'有用的ip：{ip}')
144 #             return ip
145 #
146 #     # 若ip代理无效则抛出异常
147 #     except Exception as e:
148 #         print(e)
149 #
150 # # 使用代理爬取nba
151 # def spider_nba(good_ip):
152 #     url = 'https://china.nba.com/'
153 #
154 #     proxies = {
155 #         'https': good_ip
156 #     }
157 #
158 #     response = requests.get(url, headers=HEADERS, proxies=proxies)
159 #     print(response.status_code)
160 #     print(response.text)
161 #
162 #
163 # if __name__ == '__main__':
164 #     base_url = 'https://www.xicidaili.com/nn/{}'
165 #
166 #     for line in range(1, 3677):
167 #         ip_url = base_url.format(line)
168 #
169 #         response = get_index(ip_url)
170 #
171 #         # 解析西刺代理获取每一个ip列表
172 #         ip_list = parse_index(response.text)
173 #
174 #         # 循环每一个ip
175 #         for ip in ip_list:
176 #             # print(ip)
177 #
178 #             # 对爬取下来的ip进行测试
179 #             good_ip = test_ip(ip)
180 #
181 #             if good_ip:
182 #                 # 代理有效，开始使用该代理爬取
183 #                 spider_nba(good_ip)
184 
185 
186 
187 '''
188 认证设置
189 '''
190 import requests
191 # 通过访问github的api来测试
192 url = 'https://api.github.com/user'
193 HEADERS = {
194     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
195 }
196 
197 # 测试1，失败返回401
198 # response = requests.get(url, headers=HEADERS)
199 # print(response.status_code)  # 401
200 # print(response.text)
201 '''
202 打印结果:
203     {
204       "message": "Requires authentication",
205       "documentation_url": "https://developer.github.com/v3/users/#get-the-authenticated-user"
206     }
207 '''
208 #
209 # # 测试2，通过requests.auth内的HTTPBasicAuth进行认证，认证成功返回用户信息
210 # from requests.auth import HTTPBasicAuth
211 # response = requests.get(url, headers=HEADERS, auth=HTTPBasicAuth('tankjam', 'kermit46709394'))
212 # print(response.text)
213 #
214 
215 # 测试3，通过requests.get请求内的auth参数默认就是HTTPBasicAuth，认证成功返回用户信息
216 # response = requests.get(url, headers=HEADERS, auth=('tankjam', 'kermit46709394'))
217 # print(response.text)
218 
219 
220 '''
221 上传文件
222 '''
223 import requests
224 
225 # 上传文本文件
226 # files1 = {'file': open('user.txt', 'rb')}
227 # # files参数是POST请求固定参数
228 # response = requests.post('http://httpbin.org/post', files=files1)
229 # print(response.status_code)  # 200
230 # print(response.text)  # 200
231 
232 # 上传图片文件
233 # files2 = {'jpg': open('一拳.jpg', 'rb')}
234 # response = requests.post('http://httpbin.org/post', files=files2)
235 # print(response.status_code)  # 200
236 # print(response.text)  # 200
237 #
238 # 上传视频文件
239 # files3 = {'movie': open('love_for_GD.mp4', 'rb')}
240 # response = requests.post('http://httpbin.org/post', files=files3)
241 # print(response.status_code)  # 200
242 # print(response.text)  # 200

selenium初级使用

  1 ''''''
  2 '''
  3 selenium模块讲解
  4 一 什么是selenium？
  5     最初是一个自动化测试工具。可以使用它帮我们驱动浏览器
  6     自动去执行某些自定义好的操作。例如在页面中执行JS代码、
  7     跳过登录验证。可以使用selenium帮我们实现爬虫。
  8     
  9 二 为什么要使用selenium？
 10     1、优点:
 11         使用requests模块登录需要分析大量的复杂通信流程，使用selenium
 12     可以轻松跳过登录验证。
 13     
 14     2、缺点:
 15         浏览器会加载css、js、图片、视频...数据，爬虫效率相比requests模块要低。
 16         
 17 三 如何使用selenium？
 18     下载selenium模块：
 19         pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple selenium
 20     下载浏览器驱动:
 21         http://npm.taobao.org/mirrors/chromedriver/2.38/
 22 '''
 23 
 24 # selenium之第一次
 25 from selenium import webdriver  # 用来驱动浏览器的
 26 
 27 # 调用得到一个动作链对象，破解滑动验证码的时候用的，可以拖动图片
 28 from selenium.webdriver import ActionChains
 29 
 30 # 按照什么方式查找属性，By.ID,  By.CSS_SELECTOR， By.CLASS_NAME
 31 from selenium.webdriver.common.by import By
 32 
 33 from selenium.webdriver.common.keys import Keys  # 键盘按键操作
 34 
 35 # 和下面WebDriverWait一起用的，EC是expected_conditions的别名
 36 from selenium.webdriver.support import expected_conditions as EC
 37 
 38 # 等待页面加载某些元素
 39 from selenium.webdriver.support.wait import WebDriverWait
 40 import time
 41 
 42 # 通过谷歌浏览器驱动打开谷歌浏览器
 43 # webdriver.Chrome(r'chromedriver.exe的绝对路径')
 44 # chrome = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')  # 括号内输入chromedriver.exe的绝对路径
 45 
 46 # chromedriver.exe存放于python解释器的Scripts文件夹中
 47 
 48 # chrome是一个驱动对象
 49 chrome = webdriver.Chrome()
 50 
 51 '''
 52 实例1
 53 '''
 54 # 若try出现异常
 55 # try:
 56 #     # 往tank博客主页发送get请求
 57 #     # chrome.get('https://www.cnblogs.com/kermitjam/')
 58 #
 59 #     # 参数1: 驱动对象  参数2: 等待时间
 60 #     wait = WebDriverWait(chrome, 10)
 61 #
 62 #     # 1、访问百度
 63 #     chrome.get('https://www.baidu.com/')
 64 #
 65 #     # 2、查找input输入框
 66 #     input_tag = wait.until(
 67 #         # 调用EC的presence_of_element_located()
 68 #         EC.presence_of_element_located(
 69 #             # 此处可以写一个元组
 70 #             # 参数1: 查找属性的方式
 71 #             # 参数2: 属性的名字
 72 #             (By.ID, "kw")
 73 #         )
 74 #     )
 75 #     input_tag = wait.until(EC.presence_of_element_located((By.ID, "kw")))
 76 #
 77 #     # 3、搜索一拳超人
 78 #     input_tag.send_keys('一拳超人')
 79 #
 80 #     # 4、按键盘回车键
 81 #     input_tag.send_keys(Keys.ENTER)
 82 #
 83 #     time.sleep(3)
 84 #
 85 # # 无论发生什么都会关闭浏览器
 86 # finally:
 87 #     # 关闭浏览器
 88 #     chrome.close()
 89 
 90 
 91 '''
 92 实例2
 93 '''
 94 try:
 95     # 往tank博客主页发送get请求
 96     # chrome.get('https://www.cnblogs.com/kermitjam/')
 97 
 98     # 参数1: 驱动对象  参数2: 等待时间
 99     wait = WebDriverWait(chrome, 10)
100 
101     # 1、访问京东主页
102     chrome.get('https://www.jd.com/')
103 
104     # 2、查找input输入框
105     input_tag = wait.until(EC.presence_of_element_located((By.ID, "key")))
106 
107     # 3、搜索唐诗三百首
108     input_tag.send_keys('唐诗三百首')
109 
110     # 4、根据class属性名称查找标签
111     search_button = wait.until(
112         EC.presence_of_element_located((By.CLASS_NAME, 'button')))
113     # 5、点击搜索按钮
114     search_button.click()
115 
116     time.sleep(3)
117 
118 # 无论发生什么都会关闭浏览器
119 finally:
120     # 关闭浏览器
121     chrome.close()

selenium之基本选择器

 1 # from selenium import webdriver  # 用来驱动浏览器的
 2 # import time
 3 #
 4 # '''
 5 # 隐式等待
 6 # '''
 7 # # 获取驱动对象、
 8 # driver = webdriver.Chrome()
 9 #
10 # try:
11 #     # 显式等待: 等待某个元素加载
12 #     # 参数1: 驱动对象  参数2: 等待时间
13 #     # wait = WebDriverWait(chrome, 10)
14 #
15 #     driver.get('https://china.nba.com/')
16 #
17 #     # 隐式等待: 查找元素时若未立即找到，最多等待10秒(对之后的所有查找生效)
18 #     driver.implicitly_wait(10)
19 #     news_tag = driver.find_element_by_class_name('nav-news')
20 #     # 获取标签对象
21 #     print(news_tag)
22 #     # 获取标签的名字
23 #     print(news_tag.tag_name)
24 #
25 #
26 #     time.sleep(10)
27 #
28 # finally:
29 #     driver.close()
30 
31 
import time

from selenium import webdriver  # drives the browser
from selenium.webdriver.common.by import By  # locator strategies

# =============== Locator strategies ===================
# driver.find_element(by, value) returns the first matching element;
# driver.find_elements(by, value) returns every matching element.
#
#   1. By.LINK_TEXT          full visible text of a link
#   2. By.ID                 id attribute
#   3. By.CLASS_NAME         class attribute
#   4. By.PARTIAL_LINK_TEXT  part of the visible text of a link
#   5. By.NAME               name attribute
#   6. By.CSS_SELECTOR       CSS selector
#   7. By.TAG_NAME           tag name
#
# The find_element_by_* helper methods are deprecated and were removed in
# Selenium 4; find_element(By.…) also works on older releases.

# Create the driver object.
driver = webdriver.Chrome()

try:
    # Implicit wait: every element lookup below retries for up to 10 seconds
    # before giving up.
    driver.implicitly_wait(10)

    # Open the Baidu home page.
    driver.get('https://www.baidu.com/')

    # 1. By.LINK_TEXT: find a link by its full text.
    # send_tag = driver.find_element(By.LINK_TEXT, '登录')
    # send_tag.click()

    # 2. By.PARTIAL_LINK_TEXT: find a link by part of its text.
    login_button = driver.find_element(By.PARTIAL_LINK_TEXT, '登')
    login_button.click()
    time.sleep(1)

    # 3. By.CLASS_NAME: find an element by its class attribute.
    login_tag = driver.find_element(By.CLASS_NAME, 'tang-pass-footerBarULogin')
    login_tag.click()
    time.sleep(1)

    # 4. By.NAME: find an element by its name attribute.
    username = driver.find_element(By.NAME, 'userName')
    username.send_keys('15622792660')
    time.sleep(1)

    # 5. By.ID: find an element by its id attribute.
    password = driver.find_element(By.ID, 'TANGRAM__PSP_10__password')
    password.send_keys('*******')
    time.sleep(1)

    # 6. By.CSS_SELECTOR: find the submit button with a CSS selector
    #    (here an id selector; '.pass-button-submit' selects it by class).
    login_submit = driver.find_element(By.CSS_SELECTOR, '#TANGRAM__PSP_10__submit')
    login_submit.click()

    # 7. By.TAG_NAME: find the first element with the given tag name.
    div = driver.find_element(By.TAG_NAME, 'div')
    print(div.tag_name)

    # Keep the page visible for a while before shutting down.
    time.sleep(10)

finally:
    # quit() ends the whole browser session and stops the driver process;
    # close() would only close the current window.
    driver.quit()