python爬取网站的某一句话_python简单爬取某网站python教程内容

一般的小白python新手可能都知道廖雪峰网站吧。由于自己也是个小白,所以就想能不能将该教程爬取下来呢。说做就做。好了不多说,直接上代码:

#coding:utf-8

#autor:myndtt

import urllib2

import requests

import os

import multiprocessing

import sys

from bs4 import BeautifulSoup

from lxml import etree

import pdfkit

# Python 2-only hack: re-expose sys.setdefaultencoding (hidden at startup)
# so implicit str<->unicode conversions default to UTF-8 instead of ASCII.
reload(sys)

sys.setdefaultencoding('utf-8')

# Index page of Liao Xuefeng's Python tutorial; every other URL is scraped from here.
url='http://www.liaoxuefeng.com/wiki/001374738125095c955c1e6d8bb493182103fac9270762a000'

def geturl(url):
    """Collect the absolute URL of every tutorial chapter linked from *url*.

    Returns a list of article URLs. On an HTTP error the partial (possibly
    empty) list gathered so far is returned — failures are best-effort
    ignored by design.
    """
    links = []
    try:
        html = urllib2.urlopen(url).read()
        tree = etree.HTML(html)
        # Chapter links live in the sidebar menu of the index page.
        hrefs = tree.xpath('//*[@id="main"]/div/div/div/div/div/ul/li/a/@href')
        links = ['http://www.liaoxuefeng.com' + href.strip() for href in hrefs]
    except urllib2.HTTPError as e:
        pass
    return links

def gethtml():
    """Build an HTML table of contents for the tutorial index page.

    Fetches the index page, extracts every chapter title, and returns one
    unicode HTML fragment: a page title followed by a numbered list of
    chapter names.

    NOTE(review): the original post lost its HTML tags to blog extraction
    (the string literals were left spanning lines). The <h1>/<p>/<br>
    markup below is a reconstruction of the intended output — confirm.
    """
    text = u'<h1>廖雪峰Python教程</h1>' + u'<br>'
    a = 1
    page_html = urllib2.urlopen(url).read()
    selector = etree.HTML(page_html)
    # Chapter titles come from the same sidebar menu geturl() reads hrefs from.
    conten = selector.xpath('//*[@id="main"]/div/div/div/div/div/ul/li/a/text()')
    # Number each chapter title and append it as one TOC line.
    for con in conten:
        text = text + u'<p>' + unicode(a) + u':' + unicode(con) + u'</p>' + u'<br>'
        a = a + 1
    return text

def getothers(urllist):

n = 0

text=gethtml()

pool = multiprocessing.Pool(multiprocessing.cpu_count())

#获取每份网页要的东西

for ur in urllist:

n=n+1

m=pool.apply_async(getpage,(ur,n,)).get()

text=text+unicode(m)

#提示打印的页数

print n

pool.close()

pool.join()

file = open("pdf.html", "a")

file.write(u'

' + unicode(text) + u'')

pdfcreate()

print "ok!!!"

#实际获取每份网页要的

def getpage(ur, n):
    """Fetch one tutorial page and return its numbered heading plus body HTML.

    ur -- absolute URL of the chapter page
    n  -- 1-based chapter number used in the heading

    NOTE(review): the heading string lost its tags to blog extraction;
    the <h2> markup is a reconstruction — confirm against intent.
    """
    rep = urllib2.urlopen(ur).read()
    soup = BeautifulSoup(rep, "lxml", from_encoding='utf8')
    # Article body container on liaoxuefeng.com.
    sou = soup.find("div", {"class": "x-wiki-content"})
    # Chapter title shown in the page's <h4>.
    smu = soup.find("h4").get_text()
    page = u'<h2>' + unicode(n) + u':' + unicode(smu) + u'</h2>'
    # Rewrite relative <img src> attributes to absolute URLs so the PDF
    # renderer can fetch the images. After the first replace `sou` becomes
    # a unicode string; subsequent replaces operate on that string.
    so = sou.find_all("img")
    for s in so:
        if str(s).find("http:") == -1:
            sou = unicode(sou).replace(s.get('src'), 'http://www.liaoxuefeng.com' + s.get('src'))
    page = page + unicode(sou)
    return unicode(page)

#打印成pdf(其实可有可无)

def pdfcreate(wkhtmltopdf_path=r'C:\Windows\System32\wkhtmltopdf.exe'):
    """Render pdf.html into hello.pdf using wkhtmltopdf via pdfkit.

    wkhtmltopdf_path -- location of the wkhtmltopdf executable. Defaults to
    the original hard-coded Windows path so existing callers are unchanged,
    but can now be overridden on other machines.
    """
    config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
    pdfkit.from_url("pdf.html", "hello.pdf", configuration=config)

# Entry point: gather all chapter URLs, then download, assemble and render them.
if __name__=='__main__':
    urllis=geturl(url)
    getothers(urllis)

其实这是大半年前写的代码,写得有点糟糕,同时用了xpath和beautifulsoup,真是汗颜,也懒得改了(哈哈)。

本博客仅记录一下自己的学习生活,如果能对大家有一点借鉴作用,那也是极好的。

已标记关键词 清除标记
表情包
插入表情
评论将由博主筛选后显示,对所有人可见 | 还能输入1000个字符
相关推荐
©️2020 CSDN 皮肤主题: 游动-白 设计师:白松林 返回首页