[Crawler] Get the real file links of user-shared BaiduYun files with Chrome

with Python 2.7 + Selenium + Chrome driver

We finally arrived at a viable approach after several unsatisfactory attempts; one of those earlier attempts is described here:

http://www.cnblogs.com/ghostr/p/5823191.html

  • There is still a lot of work left to improve performance, for example by implementing threading.

  • History is a SQLite database file that can easily be parsed with the sqlite3 module. You can browse the data with DB Browser for SQLite.

# -*- coding: utf-8 -*-
#----------------------------
# Author: Kun Liu         
# Start date: 2017-03-10 
# Latest edit: 2017-03-13
# Email: lancelotdev@163.com
#=============================
# Read baiduyun file links from chrome history file

"""
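### Solution:
    1. Specify a user data directory and use selenium to drive the Chrome browser so that it creates the download tasks, without completing the downloads.
    2. Parse the History file inside the user data directory to obtain the real resource links.

### Note:
1. Resource links are not de-duplicated.
2. A verification prompt appears after repeated visits; this still needs investigation.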
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.alert import Alert

from FileItem import FileItem

# Chrome profile directory that Selenium launches with; the History database
# parsed by this script is read from the "Default" folder underneath it.
user_data_dir_path = "d://userData"

# Launch options passed to the webdriver.Chrome instance created by this script.
options = webdriver.ChromeOptions()
# Point Chrome at the profile directory above via its user-data-dir argument.
options.add_argument("user-data-dir=%s"%user_data_dir_path)


# Visit every share URL so that Chrome records the downloads in its history.
def baiduyun_url_travel(share_url_list=None):
    """Open each BaiduYun share URL in Chrome and click its download button.

    The downloads are only started, not awaited: the purpose is to make
    Chrome record them in the profile's History database.

    Args:
        share_url_list: Iterable sequence of share-page URLs. ``None`` or an
            empty list is a no-op and no browser is launched.

    Raises:
        Any selenium exception raised while loading a page or locating /
        clicking the download button. The browser is always shut down
        before the exception propagates.
    """
    # Check the input first so an empty call never spawns a Chrome process.
    if not share_url_list:
        return

    download_button_xpath = '//*[@id="layoutMain"]/div[1]/div[1]/div/div[2]/div/div/div[2]/a[2]'
    # Override navigator.platform before clicking the download button.
    js_str = "Object.defineProperty(Object.getPrototypeOf(navigator),'platform',{get:function(){return 'sb_baidu';}})"

    driver = webdriver.Chrome(chrome_options=options)
    try:
        # Init the user data such as cookie so you won't need to request a url twice.
        driver.get(share_url_list[0])
        for url in share_url_list:
            driver.get(url)
            time.sleep(3)
            driver.execute_script(js_str)
            try:
                element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, download_button_xpath))
                )
            except Exception:
                # Best-effort fallback: one direct lookup after the wait failed.
                # If this also fails, the real selenium error propagates instead
                # of an UnboundLocalError or a click on a stale element.
                element = driver.find_element_by_xpath(download_button_xpath)
            element.click()
            # Give Chrome time to register the download before moving on.
            time.sleep(5)
    finally:
        # Always release the browser, even when a page fails.
        driver.quit()


# 2017-03-13  Liu Kun
# The 'History' file is a sqlite database.
# Some download links redirect to other URLs, which Chrome records as a chain;
# here I use the direct (first) link, before any redirect.
def get_source_link_from_history(History_path):
    """Read download records from a Chrome ``History`` SQLite database.

    For every download, the first URL of its URL chain (``chain_index = 0``)
    is paired with the download's target file name and start time.

    Args:
        History_path: Path to the Chrome profile's ``History`` file.

    Returns:
        A list of dicts as produced by ``FileItem.make_dic()``, one per
        download that has both a URL-chain entry and a ``downloads`` row,
        ordered by download id.

    Raises:
        sqlite3.Error: If the file cannot be opened or queried.
    """
    import ntpath
    import sqlite3 as db
    from contextlib import closing

    # One joined query instead of one string-formatted query per row.
    sql = ("select c.url, d.current_path, d.start_time "
           "from downloads_url_chains c "
           "join downloads d on d.id = c.id "
           "where c.chain_index = 0 "
           "order by c.id")
    # closing() guarantees the connection is released even if the query fails.
    with closing(db.connect(History_path)) as conn:
        rows = conn.execute(sql).fetchall()

    partial_suffix = '.crdownload'
    items = []
    for file_link, current_path, chrome_time in rows:
        # e.g. C:\Users\kun_liu\Downloads\shadowsocks-nightly-3.2.7.apk.crdownload
        # ntpath.basename accepts both '\\' and '/' separators.
        file_name = ntpath.basename(current_path or '')
        # Strip Chrome's in-progress marker only when it is the actual suffix.
        if file_name.endswith(partial_suffix):
            file_name = file_name[:-len(partial_suffix)]
        start_time = _chrome_time_to_local_string(chrome_time)
        item = FileItem(file_name, file_link, start_time)
        items.append(item.make_dic())
    return items


def _chrome_time_to_local_string(chrome_time):
    """Format a Chrome History timestamp as local 'YYYY-mm-dd HH:MM:SS'.

    Chrome stores times as microseconds since 1601-01-01 00:00:00 UTC, so
    the value is converted to Unix seconds before formatting. Returns an
    empty string for missing values or values before the Unix epoch.
    """
    if not chrome_time:
        return ""
    # 11644473600 = seconds between 1601-01-01 and 1970-01-01.
    unix_seconds = int(chrome_time) // 1000000 - 11644473600
    if unix_seconds < 0:
        return ""
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(unix_seconds))

if __name__ == "__main__":
    import pprint

    # Sample share pages.
    # Movie: https://pan.baidu.com/s/1sl8litZ
    # App:   https://pan.baidu.com/s/1o8K255K
    share_url = [
        "https://pan.baidu.com/s/1sl8litZ",
        "https://pan.baidu.com/s/1dFBr37F",
        "https://pan.baidu.com/s/1o8K255K",
    ]

    # Step 1: let Chrome start the downloads so they land in its history.
    baiduyun_url_travel(share_url)

    # Step 2: read the recorded links back out of the profile's History file.
    History_path = os.path.join(user_data_dir_path, "Default", "History")
    items = get_source_link_from_history(History_path)
    pprint.pprint(items)


FileItem.py:
# -*- coding: utf-8 -*-
#----------------------------
# Author: Kun Liu         
# Start date: 2017-03-13  
# Latest edit: 2017-03-13
#=============================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import pprint

class FileItem(object):
    """Record describing one downloaded file found in the Chrome history.

    Inherits from ``object`` so that it is a new-style class under Python 2.

    Attributes are plain values supplied by the caller:
        file_name: Name of the downloaded file.
        file_link: Source link of the file.
        file_time: Time associated with the download (``catch_time``).
    """

    def __init__(self, file_name="", file_link="", catch_time=""):
        self.file_name = file_name
        self.file_link = file_link
        self.file_time = catch_time

    def __repr__(self):
        return "FileItem(file_name=%r, file_link=%r, catch_time=%r)" % (
            self.file_name, self.file_link, self.file_time)

    def make_dic(self):
        """Return the record as a dict with keys ``file_name``, ``link``, ``time``."""
        return {
            "file_name": self.file_name,
            "link": self.file_link,
            "time": self.file_time,
        }

# Running this module directly does nothing.
if __name__ == "__main__":
    pass

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值