Python自动化（一）使用Selenium+PhantomJS爬取电影下载链接

最新推荐文章于 2024-07-25 14:52:29 发布

置顶 Gavinsun

最新推荐文章于 2024-07-25 14:52:29 发布

阅读量2.1k

点赞数 2

分类专栏： Python 文章标签： python selenium xpath utf-8

本文链接：https://blog.csdn.net/gavinsun/article/details/77947751

版权

Python 专栏收录该内容

25 篇文章 0 订阅

订阅专栏

采集第一页中所有电影的名称和对应的迅雷链接。


#coding:utf-8
"""Scrape the first listing page and save "title,thunder-link" rows to movies.csv.

For every movie anchor on the listing page the script clicks it, switches to
the detail tab that opens, reads the href of the download anchor and appends
one complete CSV row.  A movie whose link cannot be obtained is reported on
stdout and skipped, so the file never contains half-written rows.

NOTE: webdriver.PhantomJS and the find_element(s)_by_* helpers were removed
in Selenium 4; run this with a Selenium 3.x release.
"""
import codecs

from selenium import webdriver


def _close_detail_tabs(driver, main_handle):
    """Close every tab except main_handle and leave the driver focused on it."""
    for handle in driver.window_handles:
        if handle != main_handle:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(main_handle)


driver = webdriver.PhantomJS()
try:
    driver.get("http://www.poxiao.com/mtype5.html")
    movies = driver.find_elements_by_xpath('//*/li/h3/a')
    cur_window = driver.current_window_handle  # handle of the listing tab

    # The with-block guarantees the CSV is flushed and closed even on errors.
    with codecs.open("movies.csv", 'w', encoding='utf-8') as f:
        for movie in movies:
            try:
                title = movie.text
                movie.click()  # opens the movie's detail page in a new tab
                # Pick the tab that is not the listing instead of trusting
                # the ordering of window_handles.
                detail_tabs = [h for h in driver.window_handles
                               if h != cur_window]
                driver.switch_to.window(detail_tabs[-1])
                thunder_link = driver.find_element_by_xpath('.//*/td[@class="sebc3"]/a')
                href = thunder_link.get_attribute('href')
                if not href:
                    raise ValueError("download anchor has no href")
                # Write the row only once both fields are known, so a failure
                # above cannot leave a dangling "title," without a newline.
                f.write(title + u',' + href + u"\n")
            except Exception as e:
                # Best effort per movie: report the failure and move on.
                print(u"获得电影链接失败。")
                print(repr(e))
            finally:
                # Always return to the listing tab; otherwise the next click
                # would be issued against the wrong window.
                _close_detail_tabs(driver, cur_window)
finally:
    # Terminate the PhantomJS process no matter how the run ended.
    driver.quit()

采集科幻片分类下的所有电影的名称和对应的迅雷链接。

# coding:utf-8
"""Scrape every page of the category and save "title,thunder-link" rows to movies.csv.

Starting from the category listing, the script visits each movie's detail
tab, records the title together with the href of the download anchor, then
follows the "下一页" (next page) link until no such link is found.  A movie
whose link cannot be obtained is reported on stdout and skipped, so the file
only ever contains complete rows.

NOTE: webdriver.PhantomJS and the find_element(s)_by_* helpers were removed
in Selenium 4; run this with a Selenium 3.x release.
"""
import codecs

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


def _close_detail_tabs(driver, main_handle):
    """Close every tab except main_handle and leave the driver focused on it."""
    for handle in driver.window_handles:
        if handle != main_handle:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(main_handle)


driver = webdriver.PhantomJS()
try:
    driver.get("http://www.poxiao.com/mtype5.html")
    cur_window = driver.current_window_handle  # handle of the listing tab

    # The with-block guarantees the CSV is flushed and closed even on errors.
    with codecs.open("movies.csv", 'w', encoding='utf-8') as f:
        while True:
            print(u"开始下载 {url}".format(url=driver.current_url))
            print(u"当前共打开{total_tab}个标签页。".format(
                total_tab=len(driver.window_handles)))
            movies = driver.find_elements_by_xpath('//*/li/h3/a')

            # Visit each movie and write its title and download link.
            for movie in movies:
                try:
                    title = movie.text
                    print(title)
                    movie.click()  # opens the movie's detail page in a new tab
                    # Pick the tab that is not the listing instead of
                    # trusting the ordering of window_handles.
                    detail_tabs = [h for h in driver.window_handles
                                   if h != cur_window]
                    driver.switch_to.window(detail_tabs[-1])
                    thunder_link = driver.find_element_by_xpath(
                        './/*/td[@class="sebc3"]/a')
                    href = thunder_link.get_attribute('href')
                    if not href:
                        raise ValueError("download anchor has no href")
                    # Write the row only once both fields are known, so a
                    # failure cannot leave a "title," without its newline.
                    f.write(title + u',' + href + u"\n")
                except Exception as e:
                    # Best effort per movie: report the failure and move on.
                    print(u"获得电影链接失败。")
                    print(repr(e))
                    print(driver.current_url)
                finally:
                    # Always return to the listing tab before the next click.
                    _close_detail_tabs(driver, cur_window)

            # Follow the next-page link; its absence marks the last page.
            # Only "element not found" ends the crawl normally; any other
            # driver error propagates instead of being mistaken for the end.
            try:
                next_page = driver.find_element_by_link_text("下一页")
            except NoSuchElementException:
                print(u"没有找到下一页")
                print(driver.current_url)
                break
            next_page.click()
finally:
    # Terminate the PhantomJS process no matter how the run ended.
    driver.quit()