#coding=utf-8
import os
import re
import shutil
import time
import urllib
from contextlib import closing
from datetime import datetime
#from threading import Thread

try:  # Python 3 location of the URL helpers
    from urllib.parse import quote
    from urllib.request import urlopen
except ImportError:  # Python 2 fallback
    from urllib import quote, urlopen

import selenium.webdriver.support.ui as ui
from pyquery import PyQuery as pq
from selenium import webdriver

import IniFile
import LogFile
import mongoDB
class toutiaoSpider(object):
    """Search Toutiao for configured keywords and download the videos found.

    Construction reads ``setting.conf`` and opens a dated log file, both in
    the parent of the current working directory, builds one search URL per
    keyword and starts a PhantomJS WebDriver session.
    """

    def __init__(self):
        """Load configuration, build the search URLs and start the browser."""
        base_dir = os.path.dirname(os.getcwd())

        logfile = os.path.join(base_dir, time.strftime('%Y-%m-%d') + '.txt')
        self.log = LogFile.LogFile(logfile)

        configfile = os.path.join(base_dir, 'setting.conf')
        cf = IniFile.ConfigFile(configfile)
        webSearchUrl = cf.GetValue("toutiao", "webSearchUrl")

        # A trailing ';' in the setting would otherwise yield an empty
        # keyword and therefore an empty search request.
        raw_keywords = cf.GetValue("section", "information_keywords").split(';')
        self.keyword_list = [word for word in raw_keywords if word.strip()]

        self.db = mongoDB.mongoDbBase()
        self.start_urls = [webSearchUrl + quote(word)
                           for word in self.keyword_list]

        self.driver = webdriver.PhantomJS()
        self.wait = ui.WebDriverWait(self.driver, 2)
        self.driver.maximize_window()

    def log_print(self, message):
        """Echo *message* to the console.

        ``scrapy_date`` calls this method, but the class never defined it,
        so the crawl failed with ``AttributeError`` on the first keyword.
        NOTE(review): printing to stdout is an assumption about the intended
        behaviour -- confirm against the original helper if one exists.
        """
        print(message)

    def down_video(self, videourl, save_dir=None):
        """Download a video to local disk.

        :param videourl: URL of the video; empty or ``None`` is ignored.
        :param save_dir: target directory; defaults to the ``video``
            directory next to the current working directory. It is created
            when missing.
        :return: path of the written file, or ``None`` when nothing was
            downloaded because *videourl* was empty.
        """
        if not videourl:
            return None

        # Protocol-relative URLs ("//host/path") are valid inside a page but
        # cannot be opened by urlopen without a scheme.
        if videourl.startswith('//'):
            videourl = 'http:' + videourl

        if save_dir is None:
            save_dir = os.path.join(os.path.dirname(os.getcwd()), 'video')
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)

        # The name is a timestamp with one-second resolution; add a counter
        # so two downloads in the same second do not overwrite each other.
        stamp = time.strftime('%Y%m%d%H%M%S')
        target = os.path.join(save_dir, stamp + '.mp4')
        attempt = 0
        while os.path.exists(target):
            attempt += 1
            target = os.path.join(save_dir, '%s_%d.mp4' % (stamp, attempt))

        # Open the URL first so a failed request leaves no empty file, and
        # stream the body instead of holding the whole video in memory.
        with closing(urlopen(videourl)) as response:
            with open(target, 'wb') as f:
                shutil.copyfileobj(response, f)
        return target

    def _article_urls(self, doc):
        """Return the absolute article URLs of every result card in *doc*."""
        urls = []
        for element in doc('div[class="articleCard"]').items():
            href = element.find('a[class="link title"]').attr('href')
            if href:  # cards without a title link have no page to visit
                urls.append('http://www.toutiao.com' + href)
        return urls

    def scrapy_date(self):
        """Crawl every search URL and download the video of each result.

        The browser session is always shut down when the crawl ends, whether
        it finished normally or raised.
        """
        strsplit = '------------------------------------------------------------------------------------'
        try:
            for link in self.start_urls:
                self.driver.get(link)
                # The result page is heavy; wait briefly, otherwise the
                # query may come back without any data.
                time.sleep(1)
                selenium_html = self.driver.execute_script(
                    "return document.documentElement.outerHTML")
                doc = pq(selenium_html)

                self.log.WriteLog(strsplit)
                self.log_print(strsplit)

                # Collect the links before navigating away from the results.
                for url in self._article_urls(doc):
                    self.driver.get(url)
                    htext = self.driver.execute_script(
                        "return document.documentElement.outerHTML")
                    dochtml = pq(htext)
                    videourl = dochtml('video[class="vjs-tech"]').find('source').attr('src')
                    if videourl:
                        self.down_video(videourl)
        finally:
            # quit() closes every window and ends the WebDriver session.
            self.driver.quit()
# Script entry: constructing the spider reads the config and starts PhantomJS;
# scrapy_date() then crawls every configured keyword.
obj = toutiaoSpider()
obj.scrapy_date()
本文介绍了一个使用Python Selenium实现的今日头条视频爬虫项目。该爬虫通过PhantomJS浏览器自动化工具抓取指定关键词下的视频链接,并下载视频资源到本地。项目涉及关键词配置、数据库操作、日志记录等功能。

1340

被折叠的 条评论
为什么被折叠?



