# Example: scraping Taobao "美食" (food) search results (抓取淘宝美食数据案例)
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import re
from bs4 import BeautifulSoup
# Shared Chrome session used by the Taobao example functions below.
driver = webdriver.Chrome()
# Explicit-wait helper bound to that session; each wait.until(...) gives up after 5 seconds.
wait = WebDriverWait(driver, 5)
def next_page(page):
    """Jump to result page ``page`` through the pager form, then parse that page.

    Args:
        page: Page number to type into the pager's input box.

    The function blocks until the pager's active item shows ``page`` and then
    calls ``get_product_info(page)``. The module-level ``wait`` raises when an
    expected element does not appear within its timeout.
    """
    print("正在切换", page, "页")
    page_box = wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-pager div div div div.form input")
        )
    )
    page_box.clear()
    page_box.send_keys(str(page))
    # find_element(By.CSS_SELECTOR, ...) works on Selenium 3 and 4; the
    # find_element_by_css_selector shortcut was removed in Selenium 4.3.
    submit = driver.find_element(
        By.CSS_SELECTOR, "#mainsrp-pager div div div div.form span.btn.J_Submit"
    )
    submit.click()
    # Do not parse until the pager highlights the requested page number.
    wait.until(
        EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, "#mainsrp-pager div div div ul li.item.active"),
            str(page),
        )
    )
    get_product_info(page)
def get_product_info(page):
    """Parse the currently loaded result page and print one dict per product.

    Args:
        page: Page number, used only in the progress message.

    Waits for at least one product node to be present, parses the page source
    with BeautifulSoup (lxml parser) and prints a dict holding the location,
    shop name, title, image URL and link of every product node.
    """
    print("正在解析", page, "页")
    item_selector = "#mainsrp-itemlist .items .item"
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
    soup = BeautifulSoup(driver.page_source, "lxml")
    for node in soup.select(item_selector):
        # Each lookup takes the first match inside this product node.
        record = {
            "location": node.select(".location")[0].text,
            "shopname": node.select(".shopname")[0].text,
            "title": node.select(".title .J_ClickStat")[0].text,
            "image": node.select("img")[0]["data-src"],
            "data_link": node.select(".pic-link.J_ClickStat.J_ItemPicA")[0]["href"],
        }
        print(record)
def get_total_pagel():
    """Search Taobao for "美食", parse the first result page and return the page count.

    Returns:
        str: The first run of digits found in the pager's "total" label.
        Callers convert it with ``int()``.

    Raises:
        ValueError: If the "total" label contains no digits.
    """
    driver.get("https://www.taobao.com/")
    search_box = wait.until(EC.presence_of_element_located((By.ID, "q")))
    search_box.send_keys("美食")
    # find_element(By..., ...) replaces the find_element_by_* shortcuts
    # that were removed in Selenium 4.3.
    driver.find_element(By.CSS_SELECTOR, ".btn-search").click()
    # The click triggers a page load; wait for the pager label instead of
    # reading it immediately, which could run before the results exist.
    total = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "total"))
    ).text
    match = re.search(r"\d+", total)
    if match is None:
        raise ValueError("no page count found in pager text: %r" % total)
    total_num = match.group()
    get_product_info(1)
    return total_num
# Script entry point for the Taobao example: parse page 1, then walk the
# remaining pages one by one.
if __name__ == '__main__':
    try:
        total_pages = int(get_total_pagel())
        for page in range(2, total_pages + 1):
            next_page(page)
    finally:
        # Always close the browser, even when a page fails to load or parse.
        driver.quit()
# Example: attempting machine recognition of captchas while logging in to Zhihu (尝试对验证码进行机器识别处理--登录知乎案例)
import time
from selenium import webdriver
import base64
from PIL import Image
from pytesseract import image_to_string
def get_image_text():
    """Return the captcha text, using OCR first and manual entry as a fallback.

    Reads ``captcha.jpg`` from the current directory and runs it through
    ``image_to_string``. If OCR raises or yields no usable text, the user is
    prompted to type the captcha instead.

    Returns:
        str: The recognised text with surrounding whitespace removed, or
        whatever the user typed at the prompt.
    """
    result_text = None
    try:
        # The context manager closes the image file handle once OCR is done.
        with Image.open("captcha.jpg") as image:
            result_text = image_to_string(image)
        print("reuslt_text:", result_text)
    except Exception as e:
        # Deliberately broad: any OCR or image failure falls back to manual entry.
        print(e)
    # OCR output commonly carries trailing whitespace/newlines that would
    # otherwise be typed into the captcha field.
    result_text = result_text.strip() if result_text else ""
    if not result_text:
        result_text = input("请输入验证码:")
    return result_text
# Save the captcha image
def save_captcha(base64data):
    """Decode a base64 ``data:`` URI and write the bytes to ``captcha.jpg``.

    Args:
        base64data: Image source string such as
            ``data:image/jpg;base64,<payload>``. Everything up to and
            including the first comma is treated as the header and dropped;
            a string without a comma is decoded as a bare payload.

    Raw newlines and URL-encoded newlines (``%0A``) inside the payload are
    removed before decoding.
    """
    # Split on the first comma rather than slicing a fixed-length prefix, so
    # other MIME types (e.g. image/png) are handled as well.
    header, sep, payload = base64data.partition(",")
    if not sep:
        payload = header
    base64data = payload.replace("\n", "").replace("%0A", "")
    print("image_data==", base64data)
    image_data = base64.b64decode(base64data)
    with open("captcha.jpg", "wb") as file:
        file.write(image_data)
    print("验证保存成功!")
def zhihulogin():
    """Log in to Zhihu with Selenium, solving an English captcha via OCR if shown.

    Flow: open the sign-up page, switch to the login form, type the account
    and password placeholders, handle a captcha if the page contains one,
    submit the form and save a screenshot.

    An English captcha is saved with ``save_captcha`` and solved with
    ``get_image_text``. A Chinese captcha cannot be solved here: it is saved
    and the function returns without submitting. The browser is closed on
    every exit path.
    """
    # Local import so this example does not rely on the imports of the
    # preceding example in the same file.
    from selenium.webdriver.common.by import By

    # A captcha <img> whose src is not longer than this carries no image data.
    empty_src_len = len("data:image/jpg;base64,null")

    driver = webdriver.Chrome()
    try:
        # Open the sign-up page
        driver.get("https://www.zhihu.com/signup?next=%2F")
        time.sleep(2)
        # Switch to the login form
        # (find_element(By..., ...) replaces the find_element_by_* shortcuts
        # removed in Selenium 4.3.)
        driver.find_element(
            By.XPATH, '//div[@class="SignContainer-switch"]/span'
        ).click()
        # Type the account
        driver.find_element(By.NAME, "username").send_keys("账号")
        # Type the password
        driver.find_element(By.NAME, "password").send_keys("密码")
        driver.save_screenshot("输入账号和密码完毕.png")

        page_source = driver.page_source
        if "Captcha-englishContainer" in page_source:
            src = driver.find_element(
                By.CLASS_NAME, "Captcha-englishImg"
            ).get_attribute("src")
            print(src)
            if len(src) > empty_src_len:
                # An English captcha is shown
                print("英文验证码出现")
                save_captcha(src)
                # Recognise the captcha and type it in
                result_text = get_image_text()
                captcha_box = driver.find_element(
                    By.XPATH, '//div[@class="Input-wrapper"]/input'
                )
                captcha_box.send_keys(result_text)
            else:
                print("没有英文验证码")
        # The original compared find() against 1 instead of -1, so this
        # branch ran even when the page had no Chinese captcha container.
        elif "Captcha-chineseContainer" in page_source:
            src = driver.find_element(
                By.CLASS_NAME, "Captcha-chineseImg"
            ).get_attribute("src")
            print(src)
            if len(src) > empty_src_len:
                # A Chinese captcha is shown; it cannot be solved here, so
                # stop instead of using the browser after it has been closed.
                print("出现中文验证码了")
                save_captcha(src)
                return
            else:
                print("没有中文验证码")

        # Captcha handling is done; submit the login form
        submit = driver.find_element(
            By.XPATH, '//div[@class="Login-content"]/form/button'
        )
        submit.click()
        time.sleep(5)
        # Save a screenshot of the page after logging in
        driver.save_screenshot("登录成功.png")
    finally:
        # Close the browser on every exit path, including errors.
        driver.quit()
# Script entry point for the Zhihu login example: run the login flow when
# this file is executed directly.
if __name__ == "__main__":
    zhihulogin()