Background:
I previously posted an article on collecting material; this one is about collecting the material's sources. The code is fairly simple, and to avoid having to re-crawl everything when an error occurs, the crawl is split into two steps: first fetch the list of books, then fetch their contents. (The crawled data is for learning purposes only; do not use it commercially.)
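Because data.txt acts as the checkpoint between the two steps, the content step could also be made restartable. Below is a minimal sketch of that idea, assuming each finished book ends up as doc/<title>.txt exactly as in the content script further down; the helper name already_downloaded is hypothetical and is not part of the scripts themselves.

import os, re

def already_downloaded(entry, doc_dir='doc'):
    # entry is one line from data.txt, e.g. '【book title】 http://www.biquge.info/...'
    # (hypothetical helper: the scripts below do not skip finished books themselves)
    m = re.search(r'【(.*)】', entry)
    return bool(m) and os.path.exists(os.path.join(doc_dir, m.group(1) + '.txt'))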
Code (fetching the list):
import requests, time
from bs4 import BeautifulSoup

def get_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/67.0.3396.99 Safari/537.36'
    }
    requests.packages.urllib3.disable_warnings()  # suppress the InsecureRequestWarning triggered by verify=False
    response = requests.get(url, headers=headers, verify=False)
    if response.status_code == 200:  # treat only HTTP 200 as success
        response.encoding = response.apparent_encoding
        return response.text
    return None

def getList(soup):
    pattern = soup.find('div', {'id': 'newscontent'})
    pattern = pattern.find('ul')
    spans = pattern.find_all('span', {'class': 's2'})  # each s2 span holds one book link
    result = []
    for span in spans:
        tagA = span.find('a')
        result.append('【' + tagA.get_text() + '】 ' + tagA['href'])
    write(result)

def write(result):
    with open('data.txt', 'a', encoding='utf-8') as f:  # append so results from earlier pages are kept
        for word in result:
            print(word)
            f.write('\n' + word)

listUrl = 'http://www.biquge.info/list/6_1.html'

if __name__ == '__main__':
    html = get_one_page(listUrl)
    soup = BeautifulSoup(html, 'lxml')
    # read the pagination block to find the number of the last list page
    pagelink = soup.find('div', {'id': 'pagelink'})
    last = pagelink.find('a', {'class': 'last'}).get_text()
    for pageIndex in range(int(last)):
        pageUrl = 'http://www.biquge.info/list/6_' + str(pageIndex + 1) + '.html'
        html = get_one_page(pageUrl)
        soup = BeautifulSoup(html, 'lxml')
        print('Current page: ' + str(pageIndex + 1))
        getList(soup)
        time.sleep(3)  # pause between pages to go easy on the site
Console output:
data.txt output:
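Each line that write() appends to data.txt has the form 【book title】 followed by a space and the book's URL. A quick sketch of how the content script later pulls the two parts back out (the sample entry is made up for illustration):

import re

sample = '【Some Book】 http://www.biquge.info/xx_xxxx/'  # made-up entry in the format write() produces
title = re.search(r'【.*】', sample).group()    # -> '【Some Book】'
link = re.search(r'http(.*)', sample).group()   # -> 'http://www.biquge.info/xx_xxxx/'
print(title, link)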
Once the list has been crawled, we can move on to crawling the actual chapter contents:
Code (chapter contents):
import requests, time, re
from bs4 import BeautifulSoup
import bs4

def get_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/67.0.3396.99 Safari/537.36'
    }
    requests.packages.urllib3.disable_warnings()  # suppress the InsecureRequestWarning triggered by verify=False
    response = requests.get(url, headers=headers, verify=False)
    if response.status_code == 200:  # treat only HTTP 200 as success
        response.encoding = response.apparent_encoding
        return response.text
    return None

def parse_one_page(html):
    soup = BeautifulSoup(html, 'lxml')
    chapter_name = soup.h1.string
    pattern = soup.find('div', {'id': 'content'})
    # the navigation bar contains 上一章 / 章节目录 / 下一章; capture the href of the "next chapter" link
    next_url = re.search(r'<a.*?上一章.*?(章节目录|章节列表).*?a href="(.*?)">下一章.*?报错欠更</a>', html, re.S).group(2)
    book_list = []  # chapter title plus every text node of the content div
    if chapter_name:
        book_list.append(chapter_name)
    if pattern:
        for br in pattern:
            if isinstance(br, bs4.element.NavigableString):  # keep plain text nodes, skip the <br> tags
                if br.string and br.string != 'go' and br.string != 'over':  # drop the site's filler markers
                    book_list.append(br.string)
    print(chapter_name, 'downloaded, next chapter:', next_url)
    return book_list, next_url

def write(result):
    with open('data.txt', 'a', encoding='utf-8') as f:
        for word in result:
            print(word)
            f.write('\n' + word)

fileNameList = []

def read():
    # collect the titles inside 【...】 from data.txt (helper, not used by the main flow below)
    with open('data.txt', 'r', encoding='utf-8') as f:
        for line in f.readlines():
            line = line.strip('\n')  # drop the trailing newline of each entry
            match = re.search(r'【.*】', line)
            if match:
                title = re.sub(r'(【|】)', '', match.group())
                fileNameList.append(title)

def getContent(bookUrl, url, fileName):
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    print('Download started, please wait...')
    while True:
        # retry a failed request up to 10 times before giving up
        num = 0
        while True:
            html = get_one_page(url)
            num = num + 1
            if num > 10 or html is not None:
                break
        if html is None:
            print('Failed to fetch', url, 'after 10 attempts, stopping this book')
            break
        content, next_url = parse_one_page(html)
        with open('doc/' + fileName + '.txt', 'a+', encoding='utf-8') as f:
            for i in content:
                f.write(i)
        url = next_url
        if url == bookUrl:  # the last chapter's "next" link points back to the book's index page
            break
    print('Download finished')
    end = time.perf_counter()
    print('Running time: %.2f seconds' % (end - start))
    time.sleep(2)

def getFileList():
    fileList = []
    with open('data.txt', 'r', encoding='utf-8') as f:
        for line in f.readlines():
            line = line.strip('\n')  # drop the trailing newline of each entry
            if line:
                fileList.append(line)
    return fileList

if __name__ == '__main__':
    fileList = getFileList()
    for file in fileList:
        link = re.search(r'http(.*)', file)
        title = re.search(r'【.*】', file)
        if link and title:
            title = title.group()
            link = link.group()
            html = get_one_page(link)
            soup = BeautifulSoup(html, 'lxml')
            # the first <a> inside div#list is the first chapter of the book
            pattern = soup.find('div', {'id': 'list'})
            pattern = pattern.find('a')
            a = pattern['href']
            newHref = link + a
            print(title)
            getContent(link, newHref, title)
Result (console):
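One practical note: getContent() writes each book into doc/<title>.txt but never creates the folder, so the doc directory has to exist before the script runs. Something like the following (or a manual mkdir doc) takes care of it:

import os

os.makedirs('doc', exist_ok=True)  # create the output folder if it does not exist yet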