I want to use Python to simulate an HTTP client that periodically downloads electronic-music torrent files from tfile.ru.
Because the first fetch returns this page:

<html><body><script>document.cookie="tmbx=8ce596fa8478381750c8fafafe050274";document.location.href="http://tfile.ru/forum/viewforum.php?f=186&attempt=1#r";</script></body></html>

the JavaScript has to be parsed by hand to set the cookie and follow the redirect.
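For reference, the redirect target inside that JavaScript can be pulled out the same way as the cookie; a minimal sketch, assuming the page always uses this exact quoting (resp holds the html above):

import re

# extract the JS redirect target from the bootstrap page
m = re.search(r'document\.location\.href="([^"]+)"', resp)
if m:
    redirectUrl = m.group(1)  # the viewforum URL with attempt=1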
Step 1: fetch the page. Modules used: urllib, urllib2, cookielib. Parse the HTML to get the cookie and add it to the CookieJar.
import urllib, urllib2, cookielib
import re

def PageRetrive(targetUrl, opener):
    "retrieve the html page from targetUrl"
    resp = opener.open(targetUrl)
    return resp.read()

targetUrl = 'http://tfile.ru/forum/viewforum.php?f=186'
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
resp = PageRetrive(targetUrl, opener)
print resp

# output file
f = open('tmp.html', 'w')

## regex match
m1 = re.search(r'document\.cookie="(\w+)=(\w+)";', resp)
if m1:
    print m1.group(1)
    print m1.group(2)
    ck = cookielib.Cookie(version=0, name=m1.group(1), value=m1.group(2),
                          port=None, port_specified=False,
                          domain='tfile.ru', domain_specified=False, domain_initial_dot=False,
                          path='/', path_specified=True, secure=False,
                          expires=None, discard=True, comment=None, comment_url=None,
                          rest={'HttpOnly': None}, rfc2109=False)
    cj.set_cookie(ck)

for index, cookie in enumerate(cj):
    print index, ' : ', cookie

# fetch again, now with the cookie attached
resp = PageRetrive(targetUrl, opener)
print resp
f.write(resp)
f.close()
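The Cookie constructor takes a lot of positional noise. An alternative I considered (just a sketch, not what the script above uses) is to bypass cookielib and attach the cookie as a plain request header; urllib2 openers carry an addheaders list that is sent with every request:

# alternative sketch: send the parsed cookie as a raw header on every request
opener.addheaders.append(('Cookie', '%s=%s' % (m1.group(1), m1.group(2))))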
Step 2 is parsing the HTML and extracting the sub-page URLs from it.
A web search shows that besides HTMLParser, Python also has ElementTidy, BeautifulSoup, and lxml.
I decided to try lxml.
2.1 Download and install lxml (Windows 7 x64)
Download ez_setup.py and run it to install setuptools.
In C:\Python27\Scripts, run easy_install --allow-hosts=lxml.de,*.python.org lxml
Running easy_install kept failing with errors, though.
So I turned to the unofficial builds at
http://www.lfd.uci.edu/~gohlke/pythonlibs/
Find the matching exe there, download it, and install; that just works.
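To verify the install, lxml.etree exposes its version constants, so a quick one-liner from the command prompt is enough:

python -c "import lxml.etree; print lxml.etree.LXML_VERSION"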
2.2 Parse the HTML
Using XPath syntax (resp2 here is a topic page fetched in the final script below):
tmpDoc = lxml.html.document_fromstring(resp2)
dmtCount2 = tmpDoc.xpath('//a[@class="bDownload button"]')
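A slightly fuller sketch of how this fits together, using the forum listing page from step 1 and the same XPath expression the final script uses for topic links:

import lxml.html

doc = lxml.html.document_fromstring(resp)    # forum listing page from step 1
links = doc.xpath('//td[@class="t"]/a')      # topic links on the listing page
for a in links:
    print a.attrib['href'], a.text_content()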
Step 3: make the downloads fully automatic.
The box runs Fedora, with mldonkey installed to handle the torrent downloads.
As soon as a downloaded .torrent file is moved into mldonkey's .mldonkey/torrents/incoming folder, mldonkey picks it up and starts downloading automatically.
All that is left is a script that runs the downloader and moves the files, added to cron to run on a schedule:
#! /bin/bash
python tfile.py
mv *.torrent ../../.mldonkey/torrents/incoming
Add a cron entry to run it every day at 20:00:
0 20 * * * /pathtofile/run.sh
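One caveat: cron typically starts the job from the user's home directory with a minimal environment, so the relative paths in run.sh (the mv target in particular) must resolve from there; either use absolute paths or have the script cd to its own directory first.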
Step 4: keep a list of already-downloaded files locally and check it before each download, so nothing is fetched twice. A good excuse to learn Python lists, too.
#read downloaded
with open('data.txt') as f:
    lines = f.read().splitlines()
Extract the id and check whether the target is already in the list:
m2 = re.search(r'http://tfile\.ru/forum/download\.php\?id=(\d+)&uk=1111111111', url)
if m2 and m2.group(1) not in lines:
    lines.append(m2.group(1))
When the run finishes, write the list back to the local file:
#write list to file
with open('data.txt', 'w') as w:
    for item in lines:
        w.write("%s\n" % item)
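Side note: for membership tests like this, a set is the more idiomatic container, since in on a list is a linear scan. A sketch of the same logic (my variation, not what the final script uses):

# read the history into a set for O(1) lookups
seen = set(lines)
if m2 and m2.group(1) not in seen:
    seen.add(m2.group(1))
    lines.append(m2.group(1))  # keep the list too, since that is what gets written back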
Final code:
#!/usr/bin/env python
import urllib, urllib2, cookielib
import re
import math
import socket

def DownloadTorrent(url, opener, filename):
    try:
        req = opener.open(url, timeout=25.0)
        total_size = int(req.info().getheader('Content-Length').strip())
        downloaded = 0
        with open(filename, 'wb') as fp:
            while True:
                chunk = req.read(8192)
                if not chunk:
                    break
                downloaded += len(chunk)
                # crude progress indicator, in percent
                print math.floor((float(downloaded) / total_size) * 100)
                fp.write(chunk)
        req.close()
    except socket.timeout, e:
        print 'time out'
        return False
    except urllib2.HTTPError, e:
        print 'Http Error:', e.code, url
        return False
    except urllib2.URLError, e:
        print 'URL Error:', e.reason, url
        return False
    return filename
def PageRetrive(targetUrl, opener):
    try:
        resp = opener.open(targetUrl, timeout=35.0)
        data = ""
        while True:
            chunk = resp.read(8192)
            if not chunk:
                break
            data += chunk
        resp.close()
    except socket.timeout, e:
        print 'Time out'
        return False
    except urllib2.URLError, e:
        return False
    return data
targetUrl = 'http://tfile.ru/forum/viewforum.php?f=186'
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
resp = PageRetrive(targetUrl, opener)

#read downloaded
with open('data.txt') as f:
    lines = f.read().splitlines()

## regex match
m1 = re.search(r'document\.cookie="(\w+)=(\w+)";', resp)
if m1:
    print m1.group(1)
    print m1.group(2)
    ck = cookielib.Cookie(version=0, name=m1.group(1), value=m1.group(2),
                          port=None, port_specified=False,
                          domain='tfile.ru', domain_specified=False, domain_initial_dot=False,
                          path='/', path_specified=True, secure=False,
                          expires=None, discard=True, comment=None, comment_url=None,
                          rest={'HttpOnly': None}, rfc2109=False)
    cj.set_cookie(ck)

# fetch again, now with the cookie attached
resp = PageRetrive(targetUrl, opener)
#for html parse
import lxml.html

xmlDoc = lxml.html.document_fromstring(resp)
dmtCount = xmlDoc.xpath('//td[@class="t"]/a')
print len(dmtCount)
#iterate over the topic links
for idx, val in enumerate(dmtCount):
    targetUrl = 'http://tfile.ru' + val.attrib['href']
    # retry until the topic page comes back
    while True:
        resp2 = PageRetrive(targetUrl, opener)
        if resp2:
            break
    tmpDoc = lxml.html.document_fromstring(resp2)
    dmtCount2 = tmpDoc.xpath('//a[@class="bDownload button"]')
    if not dmtCount2:
        continue
    url = 'http://tfile.ru/forum/' + dmtCount2[-1].attrib['href']
    print url
    m2 = re.search(r'http://tfile\.ru/forum/download\.php\?id=(\d+)&uk=1111111111', url)
    if m2 and m2.group(1) not in lines:
        lines.append(m2.group(1))
        # retry until the download succeeds (the old version called
        # DownloadTorrent twice per attempt: once in the condition, once in the body)
        while not DownloadTorrent(url, opener, str(idx) + '.torrent'):
            pass
#write list to file
with open('data.txt', 'w') as w:
    for item in lines:
        w.write("%s\n" % item)
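One rough edge worth noting: the script assumes data.txt already exists, so on a first run open('data.txt') raises IOError. A small guard (my own addition) placed before the read fixes that:

import os

# create an empty download-history file on the first run
if not os.path.exists('data.txt'):
    open('data.txt', 'w').close()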