import requests
import time
from re import findall
import csv
from selenium.webdriver import Chrome
from selenium import webdriver
def get_ips():
    """Fetch proxy addresses from the Mogumiao proxy API.

    Returns:
        A list with the non-empty, whitespace-stripped lines of the response
        body (presumably one ``ip:port`` entry per line -- confirm against
        the provider's output format), or ``None`` when the request fails,
        the body is empty, or the body starts with ``{`` (treated as an
        error reply instead of a list of proxies).
    """
    # NOTE(review): the appKey is hard-coded in the URL; consider loading it
    # from configuration instead of committing it to source.
    url = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=6226c130427f487385ad7b5235bc603c&count=2&expiryDate=0&format=2&newLine=3'
    try:
        # Bound the wait so an unresponsive provider cannot hang the caller.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Report a network failure like any other "no proxies" outcome so
        # callers that retry on a falsy result keep working.
        return None

    text = response.text.strip()
    # An empty body used to raise IndexError on text[0]; treat it as failure.
    if not text or text.startswith('{'):
        return None
    # splitlines() copes with both "\n" and "\r\n" separators.
    return [line.strip() for line in text.splitlines() if line.strip()]
# Browser-like User-Agent sent with every Douban request in this section.
_HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'}

# Seconds to wait for a response before giving up on a request.
_REQUEST_TIMEOUT = 10


def save_data(datas, file):
    """Write movie rows to a UTF-8 CSV file, replacing any existing file.

    Args:
        datas: Iterable of rows; each row is an iterable of
            ``rank, title, score`` values matching the header columns.
        file: Path of the CSV file to create.
    """
    # newline='' lets the csv module control line endings itself.
    with open(file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['排名', '电影名称', '评分'])
        writer.writerows(datas)


def get_douban1():
    """Scrape the first Douban Top 250 page through a proxy and save it.

    Proxy addresses are requested via ``get_ips()`` until some are returned.
    On an HTTP 200 reply the movie titles and scores are extracted with a
    regular expression and written to ``豆瓣电影.csv``; otherwise a failure
    message is printed.
    """
    # Keep asking for proxy IPs until the provider returns at least one,
    # waiting 11 seconds between attempts.
    while True:
        ips = get_ips()
        if ips:
            break
        time.sleep(11)
    print(ips)

    # Route traffic through the proxies. When only one address came back,
    # reuse it for HTTPS instead of failing with IndexError on ips[1].
    https_proxy = ips[1] if len(ips) > 1 else ips[0]
    proxies = {'http': ips[0], 'https': https_proxy}
    response = requests.get(
        'https://movie.douban.com/top250',
        headers=_HEADERS,
        proxies=proxies,
        timeout=_REQUEST_TIMEOUT,
    )
    if response.status_code == 200:
        # Group 1 captures the poster's alt text (the title), group 2 the
        # text of the "v:average" span (the score).
        re_str = r'(?s)<li>\s*<div class="item">.+?alt="(.+?)".+?"v:average">(.+?)</span>.+?</li>'
        result = findall(re_str, response.text)
        # Prefix each (title, score) pair with its 1-based position.
        new_result = [[rank, *movie] for rank, movie in enumerate(result, start=1)]
        save_data(new_result, '豆瓣电影.csv')
    else:
        print('获取数据失败')


def get_douban2():
    """Request each page of the Douban Top 250 list directly (no proxy).

    Pages are addressed by the ``start`` offset in steps of 25. The loop
    covers offsets 0 through 225; the previous stop condition
    (``start == 275``) also requested offset 250, one page past the end of
    a 250-entry list. Requesting stops at the first non-200 reply instead
    of re-requesting the same page in a tight, endless loop.

    NOTE(review): the page bodies are not processed here -- this looks like
    an unfinished demonstration; confirm what should be done with them.
    """
    for start in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={start}'
        response = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
        if response.status_code != 200:
            print('获取数据失败')
            break
# selenium设置代理
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
import requests
def get_ips():
    """Fetch proxy addresses from the Mogumiao proxy API.

    Returns:
        A list with the non-empty, whitespace-stripped lines of the response
        body (presumably one ``ip:port`` entry per line -- confirm against
        the provider's output format), or ``None`` when the request fails,
        the body is empty, or the body starts with ``{`` (treated as an
        error reply instead of a list of proxies).
    """
    # NOTE(review): the appKey is hard-coded in the URL; consider loading it
    # from configuration instead of committing it to source.
    url = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=6226c130427f487385ad7b5235bc603c&count=2&expiryDate=0&format=2&newLine=3'
    try:
        # Bound the wait so an unresponsive provider cannot hang the caller.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Report a network failure like any other "no proxies" outcome so
        # callers that check for a falsy result keep working.
        return None

    text = response.text.strip()
    # An empty body used to raise IndexError on text[0]; treat it as failure.
    if not text or text.startswith('{'):
        return None
    # splitlines() copes with both "\n" and "\r\n" separators.
    return [line.strip() for line in text.splitlines() if line.strip()]
# Launch Chrome through a proxy obtained from get_ips() and open Douban Top 250.
ips = get_ips()
if ips:
    options = ChromeOptions()
    # Add the proxy. Argument format:
    #   options.add_argument('--proxy-server=http://<proxy ip>:<port>')
    options.add_argument(f'--proxy-server=http://{ips[0]}')
    b = Chrome(options=options)
    b.get('https://movie.douban.com/top250')
else:
    # No proxy address could be obtained, so the browser is not started.
    print('获取ip失败!')
# selenium控制页面滚动
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
import time
# Local import: the locator constants are only needed by this scrolling demo.
from selenium.webdriver.common.by import By

# Open JD.com, type "手机" into the search box (#key) and submit with Enter.
b = Chrome()
b.get('https://www.jd.com/')
# find_element(By.CSS_SELECTOR, ...) replaces find_element_by_css_selector,
# which was removed in Selenium 4.3; this form works on older releases too.
search = b.find_element(By.CSS_SELECTOR, '#key')
search.send_keys('手机')
search.send_keys(Keys.ENTER)
# Pause one second before scrolling (presumably to let the results page
# start loading -- confirm whether an explicit wait would be more reliable).
time.sleep(1)

# Option 1 - jump straight to the bottom of the page:
# b.execute_script('window.scrollTo(0, document.body.scrollHeight)')

# Option 2 - scroll step by step:
#   500  -> distance scrolled per step, in pixels
#   1000 -> interval between steps, in milliseconds
# The timer is cleared once the target offset exceeds the page height.
b.execute_script("""
height = 500
t = setInterval(function(){
if (height > document.body.scrollHeight){
clearInterval(t)
}
window.scrollTo(0, height)
height += 500
}, 1000)
""")