python day92
代理池搭建
git clone git@github.com:jhao104/proxy_pool.git
pip install -r requirements.txt
DB_CONN = 'redis://127.0.0.1:8888/0'
python proxyPool.py schedule
python proxyPool.py server
http://127.0.0.1:5010/get
selenium的基本使用(模拟登陆百度)
from selenium import webdriver
import time

# Demo: drive a real Chrome browser through Baidu's account-login flow.
driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')
driver.implicitly_wait(10)  # wait up to 10s for elements to appear

# Open the login dialog.
login_entry = driver.find_element_by_xpath('//*[@id="s-top-loginbtn"]')
login_entry.click()

# Switch to the username/password login tab.
driver.find_element_by_id('TANGRAM__PSP_11__footerULoginBtn').click()

# Fill in the credentials.
user_box = driver.find_element_by_id('TANGRAM__PSP_11__userName')
pwd_box = driver.find_element_by_id('TANGRAM__PSP_11__password')
user_box.send_keys('33333@qq.com')
pwd_box.send_keys('lqz12345')
time.sleep(3)

# Submit, give the page a moment to respond, then close the window.
driver.find_element_by_id('TANGRAM__PSP_11__submit').click()
time.sleep(3)
driver.close()
无界面浏览器
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Demo: run Chrome headless (no visible window) and scrape cnblogs.
opts = Options()
opts.add_argument('--headless')
driver = webdriver.Chrome(executable_path='chromedriver.exe', options=opts)

driver.get('https://www.cnblogs.com/')
print(driver.page_source)

# Print the title of every post on the front page.
article_list = driver.find_elements_by_class_name('post-item')
for article in article_list:
    title = article.find_element_by_css_selector('a.post-item-title').text
    print(title)
driver.close()
selenium的其他用法
获取位置,属性,大小
# Demo: read an element's text, attributes, position and size.
# (`article` is the loop variable from the crawling snippet above.)
link = article.find_element_by_css_selector('a.post-item-title')
print(link.text)                   # visible text of the element
print(link.get_attribute('href'))  # value of the href attribute
print(link.id)                     # webdriver's internal element id
print(link.tag_name)               # tag name, here 'a'
print(link.location)               # {'x': ..., 'y': ...} on the page
print(link.size)                   # {'width': ..., 'height': ...}
等待元素被加载
1 代码访问速度很快,页面中控件还没加载出来,如果取控件,会报错
2 两种等待方式
-显式等待:不确定某个控件何时加载完成时使用,需要给每个要获取的控件单独加等待
# Explicit wait: block up to 10 seconds until the element with id
# 'content_left' is present in the DOM, then continue.
# (WebDriverWait comes from selenium.webdriver.support.ui, EC is
# selenium.webdriver.support.expected_conditions, By is
# selenium.webdriver.common.by — not imported in this snippet.)
wait=WebDriverWait(driver,10)
wait.until(EC.presence_of_element_located((By.ID,'content_left')))
-隐式等待(以后全用隐式等待)
# Implicit wait: applies to every subsequent find_element call on this driver.
driver.implicitly_wait(10)
元素操作
1 向input框输入值
对象.send_keys('值')
2 点击控件
对象.click()
3 清空input框中的值
对象.clear()
执行js代码
# Execute arbitrary JavaScript in the page context.
driver.execute_script('写js即可')
# Example: scroll down by one full page height.
driver.execute_script('window.scrollBy(0,document.body.scrollHeight)')
切换选项卡
import time
from selenium import webdriver

# Demo: open a second browser tab and switch between tabs.
driver = webdriver.Chrome()
driver.get('https://www.baidu.com')

driver.execute_script('window.open()')  # open a new, blank tab
print(driver.window_handles)            # handles of all open tabs

# Work in the new tab...
driver.switch_to.window(driver.window_handles[1])
driver.get('https://www.taobao.com')
time.sleep(1)

# ...then switch back to the first tab.
driver.switch_to.window(driver.window_handles[0])
driver.get('https://www.sina.com.cn')

driver.close()  # close only the current tab
driver.quit()   # shut the whole browser session down
浏览器前进后退
import time
from selenium import webdriver

# Demo: navigate through browser history with back() and forward().
driver = webdriver.Chrome()
driver.get('https://www.baidu.com')
driver.get('https://www.taobao.com')
driver.get('http://www.sina.com.cn/')

driver.back()     # back to taobao
time.sleep(3)
driver.forward()  # forward to sina again
driver.close()
异常处理
from selenium import webdriver

# Demo: exception handling around webdriver calls.
# FIX: the original created the browser *inside* try but closed it in
# finally — if webdriver.Chrome() itself failed, `browser` was unbound and
# the finally clause raised a NameError that masked the real error.
browser = None
try:
    browser = webdriver.Chrome()
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # The frame name is deliberately wrong: switching to a frame that does
    # not exist raises, which the except clause then prints.
    browser.switch_to.frame('iframssseResult')
except Exception as e:
    print(e)
finally:
    # Only close the browser if it was actually created.
    if browser is not None:
        browser.close()
selenium登陆cnblogs获取cookie
from selenium import webdriver

# Demo: reopen cnblogs as a logged-in user by replaying saved cookies.
driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.implicitly_wait(10)
driver.get('http://www.cnblogs.com')

import json
# Load the cookie list saved by an earlier login session and install
# every cookie into the current browser session.
with open('cookie.json', 'r', encoding='utf-8') as f:
    for item in json.load(f):
        driver.add_cookie(item)
driver.refresh()

import time
time.sleep(3)
driver.refresh()  # refresh again so the logged-in state is visible
driver.close()
抽屉半自动点赞
from selenium import webdriver
import time
import json
import requests
from requests.cookies import RequestsCookieJar

# Semi-automatic upvoting on dig.chouti.com: login cookies were captured
# by a selenium session beforehand and saved to chouti.json; requests
# replays them against the vote API.
header = {
    'Referer': 'https://dig.chouti.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
}

# Fetch the current "top 24h" article list (JSON API).
res = requests.get('https://dig.chouti.com/top/24hr?_=1621483658031', headers=header).json()

# Rebuild a requests cookie jar from the saved selenium cookies.
jar = RequestsCookieJar()
with open('chouti.json', 'r', encoding='utf-8') as f:
    for item in json.load(f):
        jar.set(item['name'], item['value'])

# Upvote every article in the list.
for item in res['data']:
    print(item['id'])
    res_vode = requests.post(
        'https://dig.chouti.com/link/vote',
        headers=header,
        data={'linkId': item['id']},
        cookies=jar,
    )
    print(res_vode.text)
爬取京东商品信息
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
import requests
def get_goods(driver):
    """Scrape all pages of JD search results currently shown in *driver*.

    For every product on a page, print its name, link, image URL, price and
    comment count, and download the product image into the ``img/`` folder.
    Then click "下一页" (next page) and repeat; when no next-page link is
    found, ``find_element_by_partial_link_text`` raises and the caller's
    except handler ends the crawl — same termination behavior as before.

    FIXES vs. the original:
    - The original recursed once per page, so a long result list would hit
      Python's recursion limit; this version loops instead.
    - ``next`` shadowed the builtin; renamed.
    - The ``img/`` directory is created if missing, so image writes can't
      fail with FileNotFoundError.
    """
    import os
    os.makedirs('img', exist_ok=True)  # ensure the download directory exists

    while True:
        li_list = driver.find_elements_by_class_name('gl-item')
        for li in li_list:
            # Best effort: one malformed item must not stop the whole page.
            try:
                img = li.find_element_by_css_selector('.p-img img').get_attribute('src')
                if not img:
                    # Lazy-loaded images keep their URL in data-lazy-img.
                    img = 'https:' + li.find_element_by_css_selector('.p-img img').get_attribute('data-lazy-img')
                price = li.find_element_by_css_selector('.p-price i').text
                commit = li.find_element_by_css_selector('.p-commit a').text
                href = li.find_element_by_css_selector('.p-img a').get_attribute('href')
                name = li.find_element_by_css_selector('.p-name em').text
                print('''
                商品名称:%s
                商品连接:%s
                商品图片:%s
                商品价格:%s
                商品评论数:%s
                ''' % (name, href, img, price, commit))

                # Download the product image in 1 KiB chunks.
                res = requests.get(img)
                img_name = img.rsplit('/')[-1]
                with open('img/%s' % img_name, 'wb') as f:
                    for line in res.iter_content(1024):
                        f.write(line)
            except Exception as e:
                print(e)
                continue

        # Go to the next result page; raises when there is no "下一页" link.
        next_page = driver.find_element_by_partial_link_text('下一页')
        next_page.click()
        time.sleep(1)
        driver.execute_script('window.scrollBy(0,document.body.scrollHeight)')
# Entry point: open JD, search for a keyword, then crawl the result pages.
driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.implicitly_wait(10)
try:
    driver.get('https://www.jd.com/')
    # Type the keyword into the search box and press Enter to search.
    search_box = driver.find_element_by_id('key')
    search_box.send_keys('商品', Keys.ENTER)
    get_goods(driver)
except Exception as e:
    # get_goods raises once there is no next page; that lands here too.
    print(e)
finally:
    driver.close()
几个爬虫案例
import requests

# KFC store-locator API: POST a keyword search and print the JSON response.
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}
data = {
    'cname': '',
    'pid': 20,
    'keyword': '浦东',   # search stores near Pudong
    'pageIndex': 1,
    'pageSize': 10,
}
ret = requests.post(
    'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword',
    data=data,
    headers=header,
)
print(ret.json())
import requests
from bs4 import BeautifulSoup

# Crawl page 2 of qiushibaike's text section and print every joke.
ret = requests.get('https://www.qiushibaike.com/text/page/2/')
soup = BeautifulSoup(ret.text, 'html.parser')
for article in soup.find_all(class_='article'):
    print(article.find(class_='content').text)
    print('-------')