Most Python beginners probably know Liao Xuefeng's tutorial site. Being a beginner myself, I wondered whether I could scrape the whole tutorial down, so I just went and did it. Without further ado, here is the code:
#coding:utf-8
#author:myndtt
import urllib2
import multiprocessing
import sys
from bs4 import BeautifulSoup
from lxml import etree
import pdfkit
# Python 2 hack: make implicit str/unicode conversion default to UTF-8
reload(sys)
sys.setdefaultencoding('utf-8')
url = 'http://www.liaoxuefeng.com/wiki/001374738125095c955c1e6d8bb493182103fac9270762a000'

def geturl(url):
    # Collect the absolute URL of every chapter listed in the tutorial index
    article = []
    try:
        html = urllib2.urlopen(url).read()
        selector = etree.HTML(html)
        content = selector.xpath('//*[@id="main"]/div/div/div/div/div/ul/li/a/@href')
        for each in content:
            article.append('http://www.liaoxuefeng.com' + each.strip())
    except urllib2.HTTPError:
        pass
    return article
def gethtml():
    # Build the table of contents as HTML headings
    text = u'<h1>廖雪峰Python教程</h1>' + u'<br>'
    a = 1
    html = urllib2.urlopen(url).read()
    selector = etree.HTML(html)
    content = selector.xpath('//*[@id="main"]/div/div/div/div/div/ul/li/a/text()')
    # Number each chapter title in the index
    for con in content:
        text = text + u'<h2>' + unicode(a) + u':' + unicode(con) + u'</h2>' + u'<br>'
        a = a + 1
    return text
def getothers(urllist):
    text = gethtml()
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    # Fetch every chapter in parallel: queue all the jobs first, then read
    # the results back in order (calling .get() inside the submit loop would
    # block on each page and serialize the pool)
    results = []
    for n, ur in enumerate(urllist, 1):
        results.append(pool.apply_async(getpage, (ur, n)))
    pool.close()
    pool.join()
    for n, res in enumerate(results, 1):
        text = text + unicode(res.get())
        # Progress: report which page has been assembled
        print n
    # Write the file out (and close it) before handing it to wkhtmltopdf
    with open("pdf.html", "w") as f:
        f.write((u'<html><head><meta charset="UTF-8"></head><body>' + unicode(text) + u'</body></html>').encode('utf-8'))
    pdfcreate()
    print "ok!!!"
# Fetch one chapter page and return its numbered title plus body as HTML
def getpage(ur, n):
    rep = urllib2.urlopen(ur).read()
    soup = BeautifulSoup(rep, "lxml", from_encoding='utf8')
    sou = soup.find("div", {"class": "x-wiki-content"})
    smu = soup.find("h4").get_text()
    page = u'<h2>' + unicode(n) + u':' + unicode(smu) + u'</h2>'
    # Find the <img> tags and rewrite relative src values into absolute URLs
    imgs = sou.find_all("img")
    sou = unicode(sou)
    for s in imgs:
        src = s.get('src', '')
        if src and not src.startswith('http'):
            sou = sou.replace(src, 'http://www.liaoxuefeng.com' + src)
    page = page + sou
    return page
# Render the assembled HTML into a PDF (strictly optional)
def pdfcreate():
    path_wkhtmltopdf = r'C:\Windows\System32\wkhtmltopdf.exe'
    config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)
    pdfkit.from_file("pdf.html", "hello.pdf", configuration=config)
if __name__ == '__main__':
    urllist = geturl(url)
    getothers(urllist)
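A note on the wkhtmltopdf path: the Windows installer normally puts the binary under Program Files rather than System32, and wkhtmltopdf 0.12.6 and later refuse to read local files unless you pass --enable-local-file-access, so converting pdf.html may fail out of the box. A minimal sketch, assuming the default installer location (adjust the path for your machine):

# Assumption: default wkhtmltopdf installer location; adjust as needed
path_wkhtmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)
# Maps to --enable-local-file-access, required by wkhtmltopdf >= 0.12.6
options = {'enable-local-file-access': ''}
pdfkit.from_file("pdf.html", "hello.pdf", configuration=config, options=options)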
This code is actually from over half a year ago, and it is a bit rough: it uses XPath and BeautifulSoup at the same time, which is embarrassing, but I'm too lazy to clean it up (haha).
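If I did clean it up, the XPath step folds easily into BeautifulSoup, so one parser would cover everything. A minimal sketch of geturl done that way (geturl_bs is a made-up name, and the CSS selector assumes the index layout the original XPath targeted, which may have changed):

def geturl_bs(url):
    # Same job as geturl(), but with BeautifulSoup instead of lxml XPath
    article = []
    try:
        html = urllib2.urlopen(url).read()
        soup = BeautifulSoup(html, "lxml")
        # Hypothetical selector mirroring //*[@id="main"]/.../ul/li/a/@href
        for a in soup.select('#main ul li a'):
            href = a.get('href', '').strip()
            if href:
                article.append('http://www.liaoxuefeng.com' + href)
    except urllib2.HTTPError:
        pass
    return article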
This blog is just a record of my own learning; if it happens to be of some use to you, so much the better.