#!C:\Python27\python.exe
# coding=utf8
import os
import socket
import sys
import pdfkit
import urllib2
from bs4 import BeautifulSoup
from multiprocessing import Pool

# Give every socket a 60-second default timeout so a stalled download cannot hang a worker.
socket.setdefaulttimeout(60)

# Python 2 hack: make utf-8 the default codec so Chinese titles survive implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding('utf-8')

def url_open(url):
    """Fetch a page and return it as a BeautifulSoup object, or None on failure."""
    user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36')
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url=url, headers=headers)
    try:
        page = urllib2.urlopen(request, timeout=60)
    except urllib2.URLError:
        # Covers HTTPError as well; the caller skips pages that fail to load.
        return None
    contents = page.read()
    soup = BeautifulSoup(contents.decode('utf-8', 'ignore'), "lxml")
    return soup

def retrieve_pdf(filename, link):
    # Redirect stderr to a log file so wkhtmltopdf's progress/warning output stays off the console.
    saved_stderr = sys.stderr
    with open('errlog.txt', 'w+') as logfile:
        sys.stderr = logfile
        try:
            pdfkit.from_url(link, filename)
        finally:
            sys.stderr = saved_stderr

def strip_char(string):
    # Drop characters that are illegal in Windows file names.
    illegal = ['*', '/', '\\', ':', '"', '?', '<', '>', '|']
    return ''.join(c for c in string if c not in illegal)

def crawler(root, url, num):
    """Parse one search-result page and save each listed article as a PDF."""
    soup = url_open(url)
    if soup is None:
        return
    for tr in soup.find_all("tr"):
        td = tr.find_all('td')
        if not td:
            continue
        if td[0].get_text() == u"提交时间":
            # Skip the table header row ("提交时间" means "submission date").
            continue
        date = td[0].get_text()
        title = td[1].get_text()
        category = td[2].get_text()
        poster = td[3].get_text()
        print date + " " + title + " " + category + " " + poster
        # The article URL is embedded in the row's onclick handler.
        link = root + '.'.join(tr.get('onclick').split('\'')[1].split('.')[1:])
        print link
        print "Retrieving PDF..."
        filename = strip_char(title) + '.pdf'
        print filename
        # Render into a per-worker temp file, then rename, so parallel workers do not collide.
        temp_name = 'temp' + str(num) + '.pdf'
        try:
            retrieve_pdf(temp_name, link)
        except Exception:
            pass  # wkhtmltopdf often reports errors even when the PDF was actually written
        if os.path.exists(temp_name):
            print "Retrieved Successfully!"
            os.rename(temp_name, filename)
        else:
            print 'Retrieve failed!'

def single_func(num):
    root = 'http://cb.drops.wiki'
    url = "http://cb.drops.wiki/search.php?kind=drops&keywords=&page=" + str(num)
    crawler(root, url, num)

if __name__ == '__main__':
    # single_func(1)  # single-page test
    # for page in range(1, 86):
    #     single_func(page)
    # Crawl pages 1-85 with four worker processes.
    pool = Pool(processes=4)
    for i in range(1, 86):
        pool.apply_async(single_func, (i,))
    pool.close()
    pool.join()
wooyun knowledge base crawler (automatically organizes the articles and saves them as PDF)
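pdfkit only drives the wkhtmltopdf binary, so wkhtmltopdf must be installed separately; if it is not on PATH, pdfkit can be pointed at it explicitly. A minimal sketch, assuming a typical Windows install location (the path below is an example, adjust it to your machine):

import pdfkit

# Hypothetical install path; replace with wherever wkhtmltopdf.exe actually lives.
config = pdfkit.configuration(wkhtmltopdf=r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe')
pdfkit.from_url('http://cb.drops.wiki', 'test.pdf', configuration=config)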