# -*- coding: utf-8 -*-
import urllib.request
import urllib.error
import re
import time
import bs4

# Entry URL: the 剑网3 (JX3) Baidu Tieba forum index
url_mian = 'http://tieba.baidu.com/f?kw=%E5%89%91%E7%BD%913&fr=index&fp=0&ie=utf-8&red_tag=q3464037905'
# Download a page, retrying on 5xx server errors
def download(url, num_retries=2):
    print('Downloading:', url)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    request = urllib.request.Request(url, headers=headers)
    # Check whether the URL is reachable
    try:
        html = urllib.request.urlopen(request).read().decode('utf-8')
    except UnicodeDecodeError:
        # Page is not valid UTF-8; skip it
        return None
    except urllib.error.URLError as e:
        print('Downloading error:', e.reason)
        html = None
        # Retry only on 5xx (server-side) error codes
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries - 1)
    print('Page download finished')
    return html
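
# A minimal usage sketch (not called by the main flow; the name is ours):
# download() returns the decoded HTML on success and None on failure.
def _download_demo():
    page = download(url_mian)
    if page is None:
        print('Fetch failed')
    else:
        print('Fetched %d characters' % len(page))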
# Collected thread URLs (deduplicated); filled by splicContent()
Urls = []

# Grab the title and URL of every thread on the current list page,
# then recurse into the next list page
def getUrl(url):
    content = download(url)
    if content is not None:
        print('Download succeeded')
        reg = r'a href=(.*?) class="j_th_tit "'
        ref = re.compile(reg)
        cont = re.findall(ref, content)
        for i in cont:
            splicContent(i)
        print(Urls)
        url = findNext(url)
        if url is not None:
            getUrl(url)
        else:
            return
    else:
        # Download failed; drop the URL if it was recorded earlier
        if url in Urls:
            Urls.remove(url)
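
# Sketch (not wired into the main flow): the same thread-link extraction
# done with BeautifulSoup instead of the regex above; the j_th_tit class
# name comes from that regex, the function name is ours.
def getUrl_bs4(url):
    html = download(url)
    if html is None:
        return []
    soup = bs4.BeautifulSoup(html, 'html.parser')
    links = []
    for a in soup.find_all('a', class_='j_th_tit'):
        links.append(('https://tieba.baidu.com' + a.get('href', ''), a.get_text()))
    return links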
# Split a matched link into its URL part and title part
def splicContent(i):
    print('Splitting...')
    parts = i.split(' ')
    titles = parts[1]
    # Keep only threads whose title contains one of the target keywords:
    # '818' / '八一八' (gossip), '树洞' (vent threads), '回忆' (memories)
    if titles.find('818') != -1 or titles.find('八一八') != -1 or titles.find('树洞') != -1 or titles.find('回忆') != -1:
        titles = titles[6:]
        ends = len(parts[0])
        # see_lz=1 restricts the thread to the original poster's posts
        url = 'https://tieba.baidu.com' + parts[0][1:ends - 1] + '?see_lz=1'
        print('Creating file...')
        Contents.clear()  # Fresh buffer per thread, so files don't accumulate earlier threads' posts
        lookInUrl(url)  # Start crawling the thread content
        if url not in Urls:
            Urls.append(url)
        return Urls
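
# Sketch: the keyword filter above expressed with any(); KEYWORDS repeats
# the same four strings the if-chain checks, the names are ours.
KEYWORDS = ('818', '八一八', '树洞', '回忆')
def _title_matches(title):
    return any(k in title for k in KEYWORDS)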
# Check whether the list page has a "next page" link; return its URL if so
def findNext(url):
    html = download(url)
    if html is not None:
        reg = r'a href=.* class="next pagination-item "'
        ref = re.compile(reg)
        cont = re.findall(ref, html)
        if len(cont) != 0:
            # Extract the next page's URL out of the matched anchor
            parts = cont[0].split(' ')
            ends = len(parts[1])
            nextUrl = parts[1][6:ends - 1]
            return nextUrl
        else:
            print('No next page')
            return None
    else:
        return None
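
# Sketch: the same lookup with a regex capture group instead of
# split()/slicing, assuming the anchor markup matched above.
def findNext_capture(html):
    m = re.search(r'a href="(.*?)" class="next pagination-item "', html)
    return m.group(1) if m else None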
# Accumulated post bodies for the thread currently being crawled
Contents = []
title = ''

# Download a specific thread page and extract its title and post bodies
def lookInUrl(url):
    global title
    html = download(url)
    if html is not None:
        # Only the first page of a thread has no "上一页" (previous page)
        # link, so the title is extracted there and reused for later pages
        reup = r'a href=".*?">上一页'
        refup = re.compile(reup)
        isFirst = re.findall(refup, html)
        if len(isFirst) == 0:
            # Target markup, e.g.:
            # <h3 class="core_title_txt pull-left text-overflow " title="..." style="width: 396px">...</h3>
            # <h3 class="core_title_txt pull-left text-overflow vip_red " title="..." style="width: 396px">...</h3>
            try:
                reg = r'h3 class="core_title_txt pull-left text-overflow " title="(.*?)</h3>'
                ref = re.compile(reg)
                title = re.findall(ref, html)
                title = title[0].split('>')[1]
                print(title)  # Title ready
            except IndexError:
                # Fall back to the vip_red variant of the title markup
                reg = r'h3 class="core_title_txt pull-left text-overflow vip_red " title=".*?"'
                ref = re.compile(reg)
                title = re.findall(ref, html)
                title = title[0].split(' ')[8][7:-1]
                print(title)
        # Extract the post bodies
        try:
            regg = r'<div id=".*?" class="d_post_content j_d_post_content ">(.*?)</div>'
            reff = re.compile(regg)
            content = re.findall(reff, html)
        except Exception:
            print('Exception while extracting content')
            Excepts(url)
            return
        Contents.append(content)
        getContent(html)  # Follow the thread's in-page "next page" links
        writer(title, Contents)
    else:
        return None
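
# Sketch: one regex covering both title variants (plain and vip_red) in a
# single pass instead of the try/except fallback; the class names come
# from the two patterns above, the function name is ours.
def _find_title(html):
    m = re.search(r'h3 class="core_title_txt pull-left text-overflow (?:vip_red )?" title="(.*?)"', html)
    return m.group(1) if m else None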
# Fallback: parse the thread page with BeautifulSoup when the regex
# extraction raises
def Excepts(url):
    print(url)
    content = urllib.request.urlopen(url)
    soup = bs4.BeautifulSoup(content, 'html.parser')
    posts = soup.find_all('div', {'class': 'd_post_content j_d_post_content '})
    print(posts)
    Contents.append(posts)
    # getContent() expects a string, so pass the serialized soup
    getContent(str(soup))
# Unused helper: locate the thread pager with BeautifulSoup
def getAllExcepts(url):
    content = urllib.request.urlopen(url)
    soup = bs4.BeautifulSoup(content, 'html.parser')
    pager = soup.find_all('li', {'class': 'l_pager pager_theme_5 pb_list_pager'})
    for i in pager:
        if '下一页' in i.get_text():
            print(i)
# Follow the "下一页" (next page) link inside a thread, if any
def getContent(html):
    # e.g. <a href="/p/5013807180?see_lz=1&pn=2">下一页</a>
    regg = r'a href=".*?">下一页'
    reff = re.compile(regg)
    lls = re.findall(reff, html)
    if len(lls) != 0:
        newurl = 'https://tieba.baidu.com' + lls[0][8:-5]
        lookInUrl(newurl)
    else:
        return Contents
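
# Sketch: the same page-following written as a loop; lookInUrl() and
# getContent() recurse once per page, which can get deep on long threads.
# This variant only walks the URLs and is not wired into the main flow.
def _follow_pages(first_url):
    url = first_url
    while url is not None:
        html = download(url)
        if html is None:
            break
        m = re.search(r'a href="(.*?)">下一页', html)
        url = 'https://tieba.baidu.com' + m.group(1) if m else None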
# Write the collected posts into an HTML file named after the thread title
def writer(title, content):
    try:
        with open('E:/J3/' + title + '.html', 'w', encoding='utf-8') as f:
            f.write('<!DOCTYPE html>')
            f.write('<html>')
            f.write('<head>')
            f.write('<meta charset="UTF-8">')
            f.write('<title>%s</title>' % title)
            f.write('</head>')
            f.write('<body>')
            f.write('<table>')
            print('Writing table...')
            for i in content:
                for n in i:
                    f.write('<tr><td>%s</td></tr>' % n)
            f.write('</table>')
            f.write('</body>')
            f.write('</html>')
        print('Write finished')
        time.sleep(2)
    except OSError:
        # Titles containing characters illegal in Windows filenames end up here
        return
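
# Sketch: sanitizing a title before using it as a filename, so writer()
# fails less often; the rejected character set is an assumption about
# Windows, and the function name is ours.
def _safe_filename(title):
    return re.sub(r'[\\/:*?"<>|]', '_', title)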
# Single-thread test:
# lookInUrl('https://tieba.baidu.com/p/4509181593?see_lz=1')
getUrl(url_mian)
# I had only been learning Python for a few days and wrote this crawler as
# soon as I read about crawlers. It's all in one class QwQ; I'll refactor
# it later.