#coding:utf-8
import requests
from bs4 import BeautifulSoup
import lxml
import codecs
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
# Collect the URLs of all tutorial pages
def get_url(url):
    """Collect the absolute URL of every tutorial page linked from the
    left-hand sidebar of the given index page.

    url     -- URL of the tutorial index page
    returns -- list of absolute page URLs, in sidebar order
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
    request = requests.get(url, headers=headers)
    html = request.text
    all_url = []
    soup = BeautifulSoup(html, 'lxml')
    all_a = soup.find('div', class_='x-sidebar-left-content').find_all('a')
    for a in all_a:
        href = a.get('href')
        # BUG FIX: an <a> tag without an href attribute makes .get('href')
        # return None, which crashed the '+' concatenation below. Skip those.
        if href:
            all_url.append('http://www.liaoxuefeng.com' + href)
    return all_url
# Download each page and save it to disk
def download_text(all_url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
for url in all_url: #浏览每一个页面,然后下载
re=requests.get(url,headers=headers)
html=re.text
soup=BeautifulSoup(html,'lxml')
all_h4=soup.find_all('h4')
div=soup.find('div',class_='x-wiki-content')
content=all_h4[0].text+div.text
if all_h4[0].text=='map/reduce': #这是判断里面的名字可能和路径相冲突
filename='g:\\file\\'+'map and reduce'+'.py'
elif all_h4[0].text==u'TCP/IP简介':
filename='g:\\file\\'+'TCP and IP'+'.py'
elif all_h4[0].text=='async/await':
filename='g:\\file\\'+'async and await'+'.py'
else:
filename='g:\\file\\'+all_h4[0].text+'.py'
print all_h4[0].text
print u'正在下载'+all_h4[0].text
with codecs.open(filename,'w',encoding='utf-8') as f: #将信息写入py文件中
f.write(content)
if __name__ == "__main__":
    # Index page of the Python 3 tutorial; every chapter is reachable
    # from its sidebar.
    index_url = 'http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000'
    download_text(get_url(index_url))
# Scraping Liao Xuefeng's Python 3 tutorial with Python
# (blog note: latest recommended article published 2020-10-28 21:08:18)