# Python Scraping in Practice

**A hands-on walkthrough of scraping with Selenium**

#### Step 1: Import the required libraries

```python
import json
import logging
from os import makedirs
from os.path import exists
from urllib.parse import urljoin

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
```

In practice, you would import each of these libraries at the moment you discover you need it while writing the code. They are listed up front here so you can get acquainted with them all at once; most of them come from Selenium itself.

#### Step 2: Initialize the browser

```python
from selenium.webdriver import Chrome  # import the Chrome driver class

web = Chrome()                    # launch the browser
web.get('https://www.baidu.com')  # request a page
```
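If you don't need to watch the browser work, Chrome can also run headless. A minimal sketch using Selenium's standard `Options` API (not part of the original walkthrough):

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')  # run Chrome without opening a window
browser = webdriver.Chrome(options=options)
browser.get('https://www.baidu.com')
```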

To avoid redundant code, the script wraps each step in a function defined with `def`, which cuts down on rewriting. Better still would be to structure the code with object-oriented programming, which minimizes duplication as far as possible; a sketch of that follows below.
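For a taste of that object-oriented version, here is a minimal sketch; the `Scraper` class and its method names are illustrative, not part of the original code:

```python
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait


class Scraper:
    """Illustrative wrapper bundling the browser, the wait, and the scraping steps."""

    def __init__(self, timeout=10):
        self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, timeout)

    def scrape_page(self, url, condition, locator):
        self.browser.get(url)
        self.wait.until(condition(locator))  # block until the condition holds

    def close(self):
        self.browser.close()
```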

Define a generic scraping function:

```python
def scrape_page(url, condition, locator):  # generic scraping helper
    logging.info('scraping %s', url)
    try:
        browser.get(url)
        wait.until(condition(locator))  # wait for the target element
    except TimeoutException:  # error handling
        logging.error('error occurred while scraping %s', url, exc_info=True)
```
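Here `condition` is one of Selenium's `expected_conditions` factories and `locator` is a `(By, selector)` tuple; `wait.until` keeps polling until the condition returns a truthy value or `TIME_OUT` elapses. For the listing page, the wait therefore expands to:

```python
wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '#index .item')))
```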

Define a function that builds each listing-page URL and kicks off the scrape:

```python
def scrape_index(page):
    url = INDEX_URL.format(page=page)  # fill in the page number
    scrape_page(url, condition=EC.visibility_of_all_elements_located,  # wait until items are visible
                locator=(By.CSS_SELECTOR, '#index .item'))  # CSS locator
```
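`INDEX_URL` is the template `'https://spa2.scrape.center/page/{page}'` (defined in the full script below), so the `format` call simply substitutes the page number:

```python
>>> 'https://spa2.scrape.center/page/{page}'.format(page=2)
'https://spa2.scrape.center/page/2'
```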

Define the detail-page scraper:

```python
def scrape_detail(url):
    scrape_page(url, condition=EC.presence_of_element_located,
                locator=(By.TAG_NAME, 'h2'))
```

Parse the details out of the detail page:

```python
def parse_detail():
    url = browser.current_url  # the current detail-page URL
    name = browser.find_element(By.TAG_NAME, 'h2').text
    categories = [element.text for element in
                  browser.find_elements(By.CSS_SELECTOR, '.categories button span')]
    cover = browser.find_element(By.CSS_SELECTOR, '.cover').get_attribute('src')
    score = browser.find_element(By.CLASS_NAME, 'score').text
    drama = browser.find_element(By.CSS_SELECTOR, '.drama p').text
    return {
        'url': url,
        'name': name,
        'categories': categories,
        'cover': cover,
        'score': score,
        'drama': drama
    }
```

Collect the detail-page links for each movie:

```python
def parse_index():  # yield the detail-page link of every movie
    elements = browser.find_elements(By.CSS_SELECTOR, '#index .item .name')
    for element in elements:
        href = element.get_attribute('href')
        yield urljoin(INDEX_URL, href)
```
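The `urljoin` call guards against the site ever emitting relative links: a relative `href` is resolved against `INDEX_URL`, while an absolute one (`get_attribute('href')` usually returns a fully resolved URL) passes through unchanged:

```python
>>> from urllib.parse import urljoin
>>> urljoin('https://spa2.scrape.center/page/{page}', '/detail/1')
'https://spa2.scrape.center/detail/1'
>>> urljoin('https://spa2.scrape.center/page/{page}', 'https://spa2.scrape.center/detail/1')
'https://spa2.scrape.center/detail/1'
```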

The storage function:

```python
def save_data(data):  # persist one movie record as JSON
    name = data.get('name')
    data_path = f'{RESULTS_DIR}/{name}.json'
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
```
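One caveat the original does not handle: a movie name containing a path separator or another unsafe character would break the file path. A minimal guard, using a hypothetical `safe_name` helper:

```python
import re


def safe_name(name):
    # hypothetical helper: replace characters that are unsafe in file names
    return re.sub(r'[\\/:*?"<>|]', '_', name)


# then build the path as: data_path = f'{RESULTS_DIR}/{safe_name(name)}.json'
```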

The main function that drives the run:

```python
def main():
    try:
        for page in range(1, TOTAL_PAGE + 1):
            scrape_index(page)
            detail_urls = parse_index()
            for detail_url in list(detail_urls):
                logging.info('get detail url %s', detail_url)
                scrape_detail(detail_url)
                detail_data = parse_detail()
                save_data(detail_data)
                logging.info('detail data %s', detail_data)
    finally:
        browser.close()


if __name__ == '__main__':
    main()
```
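One detail worth knowing here: `browser.close()` only closes the current window, while `browser.quit()` also shuts down the underlying driver process. For a one-shot script the difference is minor, but `quit()` is the more thorough cleanup.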

The complete code:

```python
import json
import logging
from os import makedirs
from os.path import exists
from urllib.parse import urljoin

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')  # logging setup

INDEX_URL = 'https://spa2.scrape.center/page/{page}'  # movie listing page template
TIME_OUT = 10    # maximum wait time in seconds
TOTAL_PAGE = 10  # number of listing pages

browser = webdriver.Chrome()             # initialize the browser
wait = WebDriverWait(browser, TIME_OUT)  # configure the maximum page-load wait

RESULTS_DIR = 'results'  # output directory
exists(RESULTS_DIR) or makedirs(RESULTS_DIR)  # create it if it does not exist


def scrape_page(url, condition, locator):  # generic scraping helper
    logging.info('scraping %s', url)
    try:
        browser.get(url)
        wait.until(condition(locator))  # wait for the target element
    except TimeoutException:  # error handling
        logging.error('error occurred while scraping %s', url, exc_info=True)


def scrape_index(page):
    url = INDEX_URL.format(page=page)  # fill in the page number
    scrape_page(url, condition=EC.visibility_of_all_elements_located,  # items visible
                locator=(By.CSS_SELECTOR, '#index .item'))  # CSS locator


def scrape_detail(url):
    scrape_page(url, condition=EC.presence_of_element_located,
                locator=(By.TAG_NAME, 'h2'))


def parse_detail():
    url = browser.current_url  # the current detail-page URL
    name = browser.find_element(By.TAG_NAME, 'h2').text
    categories = [element.text for element in
                  browser.find_elements(By.CSS_SELECTOR, '.categories button span')]
    cover = browser.find_element(By.CSS_SELECTOR, '.cover').get_attribute('src')
    score = browser.find_element(By.CLASS_NAME, 'score').text
    drama = browser.find_element(By.CSS_SELECTOR, '.drama p').text
    return {
        'url': url,
        'name': name,
        'categories': categories,
        'cover': cover,
        'score': score,
        'drama': drama
    }


def parse_index():  # yield the detail-page link of every movie
    elements = browser.find_elements(By.CSS_SELECTOR, '#index .item .name')
    for element in elements:
        href = element.get_attribute('href')
        yield urljoin(INDEX_URL, href)


def save_data(data):  # persist one movie record as JSON
    name = data.get('name')
    data_path = f'{RESULTS_DIR}/{name}.json'
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main():
    try:
        for page in range(1, TOTAL_PAGE + 1):
            scrape_index(page)
            detail_urls = parse_index()
            for detail_url in list(detail_urls):
                logging.info('get detail url %s', detail_url)
                scrape_detail(detail_url)
                detail_data = parse_detail()
                save_data(detail_data)
                logging.info('detail data %s', detail_data)
    finally:
        browser.close()


if __name__ == '__main__':
    main()
```
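When run, the script walks through all 10 listing pages, logs each URL it visits, and writes one JSON file per movie into the `results` directory.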
