Python: Scraping Ordinary Web Pages

File Reading and Writing

To do a good job, one must first sharpen one's tools: before scraping any web pages, let's get the file reading and writing groundwork ready!

import os
import chardet
import string
import json
import nltk
from nltk.corpus import stopwords
class Bookworm:

    # Count the total number of words in an English text, or the occurrences of a given keyword
    def word_count_en(self, text="", key=""):
        for i in string.punctuation:
            text = text.replace(i, "")
        wordlist = text.lower().split()

        def word_count():
            total = len(wordlist)
            return total

        def keyword_count():
            count = 0
            for k in wordlist:
                if k == key.lower():
                    count += 1
            return count

        if key == "":
            return word_count()
        else:
            return keyword_count()

    # Read a text document
    def txt_read(self, path=r"test.txt", output="list", encoding="utf-8"):
        try:
            with open(path, "r", encoding=encoding) as f:
                _line_list = f.readlines()
            article = [line.strip() for line in _line_list]
            # Support different output formats
            if output == "set":
                article = set(article)
            elif output in ("str", "string"):
                article = "".join(s.replace("\n", "") for s in _line_list)
            elif output != "list":
                print("txt_read: unsupported output type, falling back to the default list form.")
            return article
        except FileNotFoundError as e:
            print(e)
        except (UnicodeDecodeError, LookupError):
            # Wrong encoding guess: detect the actual encoding and retry
            with open(path, "rb") as ft:
                cs = chardet.detect(ft.read())
            return self.txt_read(path, output=output, encoding=cs["encoding"])
        return []

    # Read every text document under the target directory and return a list of article contents
    def dir_read(self, dir="resource", *exception, walk_all=False, txt_read_output="str"):
        articles = []
        if not walk_all:
            for i in os.listdir(dir):
                if i not in exception:
                    article = self.txt_read(os.path.join(dir, i), output=txt_read_output)
                    articles.append(article)
        else:
            for root, dirs, txts in os.walk(dir):
                for txt in txts:
                    if txt.endswith(".txt") and txt not in exception:
                        # join with root, not dir, so files in subdirectories are found
                        article = self.txt_read(os.path.join(root, txt), output=txt_read_output)
                        articles.append(article)
        return articles

    # Append content to a file at the given path, creating its directory first if needed
    def txt_write(self, content="", path=r"source directory"):
        dir_ = os.path.dirname(path)
        if dir_:
            self.make_dir(dir_)
        with open(path, 'a', encoding='utf-8') as f:
            f.write(content + "\n")

    # Append content to the given path in JSON format (one JSON object per line)
    def json_write(self,content,path=r"source directory"):
        with open(path, 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')

    # Remove stopwords
    def kick_stopwords(self, wordlist, language="zh", filter="normal"):
        stopwords = set()
        if language not in ("zh", "en"):
            print("Stopword lists for other languages have gone missing...")
        try:
            if language == "zh":
                if filter == "strong":
                    for i in os.listdir(r"Stopword directory"):
                        _stopwords = self.txt_read(os.path.join(r"Stopword directory", i))
                        stopwords.update(_stopwords)
                elif filter == "normal":
                    root_dir = r"Your Chinese stopword list"
                    stopwords.update(self.txt_read(root_dir))
            elif language == "en":
                root_dir = r"Your English stopword"
                stopwords = self.txt_read(root_dir)
        except FileNotFoundError:
            print("The stopword list has gone missing; check whether the path is correct!")
        # Filter in place, so callers that ignore the return value still see the change
        wordlist[:] = [w for w in wordlist if w not in stopwords]
        return wordlist

    # Filter out URLs that were already visited, recording new ones in old_urls
    def kick_visited_url(self, urls, old_urls):
        new_urls = set()
        visited = self.txt_read(old_urls,output="set")
        for url in urls:
            if url not in visited:
                new_urls.add(url)
                self.txt_write(url, old_urls)
        return new_urls

    # Create the directory if it does not already exist
    def make_dir(self, file_dir):
        _is_dir_exist = os.path.exists(file_dir)
        if not _is_dir_exist:
            os.makedirs(file_dir)

    # Read a file in fixed-size binary blocks (a generator)
    def read_file(self, fpath):
        BLOCK_SIZE = 1024
        with open(fpath, 'rb') as f:
            while True:
                block = f.read(BLOCK_SIZE)
                if block:
                    yield block
                else:
                    return

The stopword list can be taken from the article referenced here and saved as ../resource/stopword.txt. Next, save the file above as Bookworm.py so it can be imported as a module later on.
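
Before moving on, a minimal usage sketch of the module (the file name below is a placeholder, just to show the calling convention):

from Bookworm import Bookworm

worm = Bookworm()
lines = worm.txt_read("resource/sample.txt", output="list")    # read a text file as a list of lines
text = worm.txt_read("resource/sample.txt", output="string")   # or as a single string
print(worm.word_count_en(text))                                # total number of words
print(worm.word_count_en(text, key="python"))                  # occurrences of one keyword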

Accessing Ordinary Web Pages

Let's start by implementing the simplest possible web request; the requests library really does make this convenient!

class Journalist:
    useragent = "YourSpiderName"  # give your own crawler a name here
    headers1 = {
        'User-Agent': "your User-Agent string",  # fill in according to your own configuration
    }
    headers2 = {
        'User-Agent': "your backup User-Agent string",  # fill in according to your own configuration (backup)
    }

    def visit_homepage(self, url):
        timeout = 10
        try:
            rp = urp.RobotFileParser()
            rp.set_url(url + '/robots.txt')  # respect the robots.txt rules
            rp.read()
            if rp.can_fetch(self.headers1['User-Agent'], url):
                req = requests.get(url, timeout=timeout, headers=self.headers1)
                status_code = req.status_code
                print("Access OK! Status code: %s Working......" % status_code)
                return req.text
            else:
                print("The request is probably disallowed by robots.txt.")
        except urllib.error.URLError as e:
            print(f"Unable to access the URL: {e.reason}",
                  "Possible reasons:\n1. Poor network connection\n2. The target server is unreachable\n3. The requested URL does not exist\n4. No permission to access this URL")
        return ""

    def visit_one_page(self,url):
        try:
            response = requests.get(url, headers=self.headers1)
            if response.status_code == 200:
                return response.text
            else:
                print("Request failed, status_code: %s" % response.status_code)
        except RequestException:
            print("Request failed: RequestException")
            return None

Here, visit_homepage is used for a site's home page; since visit_one_page only visits sub-pages under that same site, it skips the robots.txt check. Journalist is a very general class and serves as the abstract product. For more on design patterns, see here.
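
As a quick illustration, a minimal usage sketch (assuming the placeholder headers above have been filled in; example.com stands in for a real site):

reporter = Journalist()
homepage_html = reporter.visit_homepage("https://example.com")        # checks robots.txt before fetching
page_html = reporter.visit_one_page("https://example.com/some/page")  # plain GET, no robots.txt check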

These two access methods can now be applied to scraping all kinds of web pages.
For example, Douban, the easiest sacrificial victim; but since its search page is rendered dynamically, the selenium library is needed:

class Doubanbook(Journalist):
    # Frequently used URLs
    homepageurl = "https://www.douban.com/"
    bookurl = "https://book.douban.com/"
    searchurl = "https://search.douban.com/book/subject_search"

    def parse_one_douban_book(self, url):
        book_soup = self.visit_one_page(url)
        if book_soup is not None:
            soup = BeautifulSoup(book_soup, "lxml")
        else:
            return
        book_info = {}
        # Title
        title = soup.select('h1')[0].text.strip()
        book_info['标题'] = title
        # Extract book information
        info_soup = ''.join(re.findall('<div id="info"(.*?)</div>', book_soup, re.S))
        info = re.findall('<span class="pl">(.*?)</span>(.*?)<br/?>', info_soup, re.S)
        for k, v in info:
            k = k.replace(' ', '').replace(':', '')
            href = ' '.join(re.findall('href="(.*?)"', v))
            v = re.sub('<.*?>|[\n]*|\s{2,}|&nbsp;', '', v).strip(':')
            book_info[k] = v if href == '' else v + ' ' + href

        # Content / author introduction
        def get_intro(intro):
            if intro.select('span.short'):
                intros = intro.select('span[class="all hidden"] div.intro')
            else:
                intros = intro.select('div.intro')
            return '\n'.join(k.get_text() for k in intros[0].select('p'))

        subsoup = soup.select('div.article')[0]
        for indent in subsoup.select('div.related_info>h2+div'):
            if indent.find(class_="intro"):
                content_intro = indent.find(id="link-report")
                if content_intro:
                    book_info['内容简介'] = get_intro(content_intro)
                else:
                    book_info['作者简介'] = get_intro(indent)
        # Author information
        author_soup = subsoup.select('div.related_info>h2+div ul li')
        author_info = {}
        if author_soup:
            for m in author_soup:
                if m.text:
                    author_img = m.select('img')
                    figure = author_img[0].get('src')
                    author = author_img[0].get('alt')
                    role = m.select('span.role')[0].text
                    author_info[author + '(' + role + ')'] = figure
            book_info['作者信息'] = author_info
        # Table of contents
        menu_soup = str(subsoup.select('div.related_info>h2+div+div'))
        if menu_soup:
            menu = '\n'.join(i.strip() for i in re.findall('(.*?)<br/>', menu_soup))
            book_info['目录'] = menu
        return book_info

    def search(self,content,rank=100):
        browser=webdriver.Firefox()
        params={'search_text':content,'cat':'1001'}
        browser.get('https://search.douban.com/book/subject_search?search_text={}&cat={}'.format(params['search_text'],params['cat']))
        browser.implicitly_wait(5)
        elements = browser.find_elements(By.XPATH, "html/body")
        html = browser.execute_script("return arguments[0].outerHTML;", elements[0])  # use execute_script to run JavaScript and grab the element's outer HTML
        browser.close()
        lis = re.findall(r'(https://[\w/.]+/\d+/).*?title-text\">(.*?)</a>.*?rating[\w\s\"<>/=-]*(\d{1,2}\.\d|)</span>.*?pl\">\((\d*).*?</span>', str(html))
        for url, title, rating, pl in lis:
            if rating:
                _rank = doubanrank(float(rating), int(pl))
                if re.search(content, title) and _rank > rank:   # custom filtering
                    info=self.parse_one_douban_book(url)
                    print(info)
                    # MongoDB().dump_info('journalist',info)
    def doubantopbooks(self):
        return self.visit_homepage("https://book.douban.com/")

# Usage: Doubanbook().search("what do you want to search?")

However, not every book in the search results is one we need, and many have rather low ratings, so here is a home-made composite metric combining the rating score and the number of raters; it can be plugged into the filtering step above, for reference:

import math
def doubanrank(score, number):
    if number:
        if number > 10000:
            rank = math.exp(score)
        elif number > 2000:
            rank = math.exp(score) * (-125 / number + 1.0125)
        elif number > 400:
            rank = math.exp(score) * (-25 / number + 0.9625)
        elif number > 80:
            rank = math.exp(score) * (-5 / number + 0.9125)
        elif number > 16:
            rank = math.exp(score) * (-1 / number + 0.8625)
        else:
            rank = math.exp(score) * (4 * number / 75)
        return rank
    return None
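
For intuition, a quick hypothetical sanity check of how the metric trades the score off against the number of raters:

print(doubanrank(8.8, 20000))  # large audience: rank is simply exp(8.8)
print(doubanrank(8.8, 100))    # smaller audience: exp(8.8) scaled by (-5/100 + 0.9125)
print(doubanrank(9.5, 5))      # tiny audience: exp(9.5) scaled by (4*5/75), heavily discounted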

Crawling can also make everyday reading more intuitive; a common approach is to generate word clouds.
Take The Paper (澎湃新闻), TimeWrangler's daily must-read, as an example:

class Thepaper(Journalist):
  homepageurl = "https://www.thepaper.cn"
  def save_to_text(self,file_dir, news_dict):
      res = "{} {}\n作者:{}\n时间:{}\n来源:{}\n图片数量:{}\n主题:{}\n关键词:{}\n{}"\
          .format(news_dict["title"],news_dict["url"], news_dict["writer"], news_dict["time"],news_dict["category"],len(news_dict['picture']),news_dict["subject"],news_dict["keyword"],news_dict["article"])
      title = news_dict["title"]
      title = re.sub(r'[\\/:*?|:?!;]', "_", title) # strip characters that are illegal in file names
      path = file_dir+os.sep+ title+ ".txt"
      Bookworm().txt_write(res,path)

  def save_pic(self,file_dir, pic_url):
      req = None
      for i in range(len(pic_url)):
          pic_path = file_dir+os.sep+ "%d.jpg" % (i+1)
          try:
              req = requests.get(pic_url[i])
          except requests.exceptions.MissingSchema as e:
              print("The URL of extracted picture %s seems to be missing its scheme..." % i)
              print(e)
              req = requests.get("http:"+pic_url[i])
          finally:
              # skip tiny images (icons and placeholders) under roughly 30 KB
              if req is not None and int(req.headers['Content-Length'])/1024 > 30:
                  img = req.content
                  with open(pic_path, "wb") as f:
                      f.write(img)

  def save_manuscript(self,root_dir, news_dict, pic_url):
      try:
          title = news_dict['title']
          title = re.sub(r'[\\/:*?|:?!;]', "_", title)
          file_dir = root_dir + os.sep + title
          Bookworm().make_dir(file_dir)
          self.save_to_text(file_dir, news_dict)
          self.save_pic(file_dir, pic_url)
      except Exception as e:
          print("Something went wrong while saving...")
          print(e)
          traceback.print_exc()

  def getnews(self):
      homepage = self.visit_homepage(self.homepageurl)
      if homepage != "":
          urls = re.findall("/newsDetail_forward_\d*", homepage, re.S)
          for i in range(len(urls)):
              urls[i] = self.homepageurl + urls[i]
          urls = set(urls)
          new_urls = Bookworm().kick_visited_url(urls, r"Your saving directory + new.txt")
          print("There are %d news items today." % len(new_urls))
          if len(new_urls) > 0:
              print("About to read The Paper's headlines; please keep your network connection open.")
          news_dict = {}
          for new_url in new_urls:
              news_dict["url"] = new_url
              try:
                  res = requests.get(new_url).text
                  soup = BeautifulSoup(res, "lxml")
                  # Get the title
                  title_soup = soup.select("head>title")[0].text.strip()
                  news_title = title_soup[:title_soup.find("_")]
                  news_dict["title"] = news_title
                  # Get the keywords
                  keyword = soup.select("meta[property='keywords']")[0].get("content")
                  news_dict["keyword"] = keyword
                  # Get the publication time
                  date = soup.select("div.index_left__LfzyH div.ant-space-item span")[0].text
                  news_dict["time"] = date
                  # Get the author
                  writer = soup.select("div.index_left__LfzyH>div")[0].text.strip()
                  news_dict["writer"] = writer
                  # Source of the article
                  topic_soup = soup.select("a.index_inherit__A1ImK>span")
                  category = topic_soup[0].text.strip(" >")
                  news_dict["category"] = category
                  # Topics
                  subject = []
                  for i in range(len(topic_soup)):
                      subject_soup = topic_soup[i].text.strip()
                      if subject_soup[0] == "#":
                          subject.append(subject_soup)
                  news_dict["subject"] = subject
                  # Body text and pictures
                  news_article = soup.select("div.index_cententWrap__Jv8jK>p")
                  news_pic = soup.select("div.index_cententWrap__Jv8jK>img")
                  tmp_str = ""
                  news_pic_url = []
                  for pic in news_pic:
                      news_pic_url.append(pic.get("src"))
                  for p in news_article:
                      if not p.has_attr("class") and p.text.strip() != "":
                          tmp_str += p.text + "\r\n"
                  news_dict["picture"] = news_pic_url
                  news_dict["article"] = tmp_str
                  self.save_manuscript('''Your saving directory''', news_dict, news_pic_url)
              except Exception as e:
                  print("Scraping error; this news item has been skipped!")
                  print(e)
                  traceback.print_exc()

# Usage: Thepaper().getnews()

Once the news has been saved locally, it can be used to build word clouds (a good chunk of the code below is the TF-IDF part padding out the line count):

    def _source(self, root_dir):
        return Bookworm().dir_read(root_dir,"news.txt",walk_all=True)

    def _analyze(self,articles):
        wordlist = []
        for article in articles:
            # strip the metadata header written by save_to_text, keeping the keywords and body
            article = "".join(re.findall("作者:.*?时间:.*?来源:.*?主题:.*?关键词:(.*)", article, re.S))
            words = jieba.lcut(article)
            print(words)
            Bookworm().kick_stopwords(words,filter="strong")
            time.sleep(1)
            print(words)
            wordlist.append(words)
        return wordlist

    def TF_IDF(self,wordlist):
        if not isinstance(wordlist, list) or not all(isinstance(doc, list) for doc in wordlist):
            raise ValueError("Input must be a list of lists, where each sublist represents a document.")

        words_length = 0
        list_length = len(wordlist)
        character_freq1 = dict()
        character_freq2 = dict()
        tf = dict()
        idf = dict()
        TF_IDF = dict()

        for words in wordlist:
            if not all(isinstance(word, str) for word in words):
                raise ValueError("All elements in each document must be strings.")

            for word in words:
                words_length += 1
                # term frequency over the whole corpus
                character_freq1[word] = character_freq1.get(word, 0) + 1
            # document frequency: count each word at most once per document
            for word in set(words):
                character_freq2[word] = character_freq2.get(word, 0) + 1

        for k, v in character_freq1.items():
            tf[k] = v / words_length

        for k, v in character_freq2.items():
            idf[k] = math.log10(list_length / v)

        for key in tf.keys():
            TF_IDF[key] = tf[key] * idf[key]

        return TF_IDF

    def wordcloud(self,root_dir):
        words=self.TF_IDF(self._analyze(self._source(root_dir)))
        wc = WordCloud(background_color='white', width=1600, height=900, max_font_size=400,
                       font_path='''Your font file''', collocations=False,max_words=50)
        current_time = time.strftime("%Y-%m-%d-%H%M%S", time.localtime())
        try:
            wc.generate_from_frequencies(words)
            wc.to_file('''Your path of saved news ''' + current_time +".png")
        except FileNotFoundError as e:
            print("Please check that the file path is correct!", e)
        except ValueError as e:
            print("No words were captured!", e)
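
Since these helpers all take self, they are presumably meant to live on the Thepaper class as well; a hypothetical end-to-end run (the directory path is a placeholder) would then look like this:

# Usage sketch: scrape today's headlines, then render a word cloud from the saved texts
paper = Thepaper()
paper.getnews()
paper.wordcloud(r"Your saving directory")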

Finally, don't forget to import the required packages:

import os
import re
import time
import math
import pandas
import traceback
import jieba
from wordcloud import WordCloud

import urllib.error
import urllib.robotparser as urp
import requests
from bs4 import BeautifulSoup
from requests import RequestException
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from Bookworm import *
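
Putting the pieces together, a hypothetical driver script (the search keyword is just an example):

if __name__ == "__main__":
    Doubanbook().search("Python")   # crawl Douban search results that pass the rank filter
    Thepaper().getnews()            # archive today's headlines from The Paper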