【python】Crawling chapters from a novel website

Background:

A previous post covered collecting material; this one covers collecting the source of that material. The code is fairly simple, and to keep a single error from forcing a full re-crawl, the job is split into two stages: first crawl the list of books, then crawl the chapter content. (The crawled data is for learning purposes only; do not use it commercially.)

 

Code (fetching the book list):

import requests,time
from bs4 import BeautifulSoup

def get_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 '
                      'Safari/537.36'
    }
    requests.packages.urllib3.disable_warnings()  # suppress the warning caused by verify=False
    response = requests.get(url, headers=headers, verify=False)

    if response.status_code == 200:  # only treat HTTP 200 as success
        response.encoding = response.apparent_encoding  # let requests guess the page encoding
        return response.text
    return None

def getList(soup):
    pattern = soup.find('div', {'id': 'newscontent'})
    pattern = pattern.find('ul')
    spans = pattern.find_all('span', {'class': 's2'})  # each s2 span holds one book link
    result = []
    for span in spans:
        tagA = span.find('a')
        result.append('【' + tagA.get_text() + '】 ' + tagA['href'])
    write(result)



def write(result):
    with open('data.txt', 'a', encoding='utf-8') as f:  # append so earlier pages are kept
        for word in result:
            print(word)
            f.write('\n' + word)


listUrl = 'http://www.biquge.info/list/6_1.html'
if __name__ == '__main__':
    html = get_one_page(listUrl)
    soup = BeautifulSoup(html, 'lxml')
    # read the pagination block to find the number of the last page
    pagelink = soup.find('div', {'id': 'pagelink'})
    last = pagelink.find('a', {'class': 'last'}).get_text()

    for pageIndex in range(int(last)):
        html = 'http://www.biquge.info/list/6_' + str(pageIndex + 1) + '.html'
        html = get_one_page(html)
        soup = BeautifulSoup(html, 'lxml')
        print('Current page: ' + str(pageIndex + 1))
        getList(soup)
        time.sleep(3)

Console output:

data.txt output:
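For reference, every line that write() appends to data.txt has the form 【book title】 followed by that book's index URL. The line below only illustrates the format (placeholder title and path, not real crawled data):

【Some Novel】 http://www.biquge.info/xx_xxxx/

The content script in the next section depends on exactly this layout: it recovers the title with the 【.*】 regex and the link with the http(.*) regex.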

 

 

Once the list has been crawled, the next step is to crawl the actual chapter content:

Code (content):

import requests,time,re
from bs4 import BeautifulSoup
import bs4




def get_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 '
                      'Safari/537.36'
    }
    requests.packages.urllib3.disable_warnings()  # suppress the warning caused by verify=False
    response = requests.get(url, headers=headers, verify=False)

    if response.status_code == 200:  # only treat HTTP 200 as success
        response.encoding = response.apparent_encoding  # let requests guess the page encoding
        return response.text
    return None


def parse_one_page(html):
    soup = BeautifulSoup(html, 'lxml')
    chapter_name = soup.h1.string
    pattern = soup.find('div', {'id': 'content'})
    # the "下一章" (next chapter) link sits between the "上一章" and "报错欠更" anchors in the nav bar
    next_url = re.search("<a.*?上一章.*?(章节目录|章节列表).*?a href=\"(.*?)\">下一章.*?报错欠更</a>", html, re.S).group(2)
    book_list = []  # collects the chapter title and every paragraph of text
    if chapter_name:
        book_list.append(chapter_name)
    if pattern:
        for br in pattern:
            if isinstance(br, bs4.element.NavigableString):  # keep only plain text nodes, skip the tags
                if br.string and br.string != 'go' and br.string != 'over':
                    book_list.append(br.string)
    print(chapter_name, ' finished downloading,', ' next chapter:', next_url)
    return book_list, next_url




def write(result):
    with open('data.txt', 'a', encoding='utf-8') as f:  # append so existing entries are kept
        for word in result:
            print(word)
            f.write('\n' + word)


fileNameList = []
def read():
    with open('data.txt', 'r', encoding='utf-8') as f:  # data.txt was produced by the list crawler
        for line in f.readlines():
            line = line.strip('\n')  # drop the trailing newline on each entry
            match = re.search(r'【.*】', line)
            if match:
                title = match.group()
                title = re.sub(r"(【|】)", '', title)  # strip the 【】 brackets, leaving the bare title
                fileNameList.append(title)


def getContent(bookUrl, url, fileName):
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    print('Download started, please wait...')
    while True:
        num = 0
        while True:  # retry a failed request up to 10 times
            html = get_one_page(url)
            num = num + 1
            if num > 10 or html is not None:
                break
        content, next_url = parse_one_page(html)
        with open('doc/' + fileName + '.txt', 'a+', encoding='utf-8') as f:
            for i in content:
                f.write(i)
        url = next_url
        if url == bookUrl:  # the last chapter's "next" link points back to the book index
            break
    print('Download finished')
    end = time.perf_counter()
    print('Running time:%.4s Seconds' % (end - start))
    time.sleep(2)

def getFileList():
    fileList = []
    with open('data.txt', 'r', encoding='utf-8') as f:  # one "【title】 url" entry per line
        for line in f.readlines():
            line = line.strip('\n')  # drop the trailing newline
            if line:
                fileList.append(line)
        return fileList


if __name__ == '__main__':
    fileList = getFileList()
    for file in fileList:
        link = re.search(r'http(.*)', file)
        title = re.search(r'【.*】', file)
        if link and title:
            title = title.group()
            link = link.group()
            html = get_one_page(link)
            soup = BeautifulSoup(html, 'lxml')
            pattern = soup.find('div', {'id': 'list'})
            pattern = pattern.find('a')  # the first <a> in the chapter list is the first chapter
            a = pattern['href']
            newHref = link + a  # the href is relative, so append it to the book URL
            print(title)
            getContent(link, newHref, title)
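
As written, get_one_page only returns None for non-200 responses; if the connection drops, requests.get raises an exception and the retry loop in getContent never gets a chance to retry. A possible hardening, shown here only as a sketch (the 10-second timeout is an assumption, not part of the original code), is to catch the request exception and return None so the existing retry loop can handle it:

def get_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 '
                      'Safari/537.36'
    }
    requests.packages.urllib3.disable_warnings()
    try:
        # an assumed timeout keeps a stalled request from hanging the crawl
        response = requests.get(url, headers=headers, verify=False, timeout=10)
    except requests.RequestException:
        return None  # let the caller's retry loop decide whether to try again
    if response.status_code == 200:
        response.encoding = response.apparent_encoding
        return response.text
    return None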


 

Result (console):

 

txt output:
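
Because getContent opens doc/<title>.txt in append mode, re-running the content script after a crash will download finished books again and duplicate their text. To get the "no full re-crawl" behavior mentioned in the background, one simple option, sketched below with an assumed helper name and a very loose completeness check, is to skip any book whose output file already exists:

import os

def already_downloaded(title):
    # treat a book as done if its output file exists and is non-empty;
    # a stricter check could compare the file's tail against the last chapter name
    path = os.path.join('doc', title + '.txt')
    return os.path.exists(path) and os.path.getsize(path) > 0

# inside the main loop, before calling getContent(link, newHref, title):
#     if already_downloaded(title):
#         print(title, 'already downloaded, skipping')
#         continue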
