网络爬虫

最新推荐文章于 2024-05-25 21:08:58 发布

tomandmath

最新推荐文章于 2024-05-25 21:08:58 发布

阅读量118

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/qq_42676042/article/details/106933676

版权

python 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

import re

import requests



def getHTMLText(url):
    """Download *url* and return the response body as decoded text.

    The request is made with a 30-second timeout. The body is decoded using
    the encoding that requests detects from the content itself
    (``apparent_encoding``) instead of the one declared in the headers.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or the fixed string "获取连接出错了" when the
        request fails (connection problem, timeout, invalid URL, or an HTTP
        error status reported by ``raise_for_status``).
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        # Only request-level failures are turned into the sentinel string;
        # KeyboardInterrupt and programming errors are no longer swallowed.
        return "获取连接出错了"



def parsePage(ilt, html):
    """Extract paired text fields from *html* and append them to *ilt*.

    Two regular expressions collect the text that directly follows the
    ``class="ysM1nrSp1">`` and ``class="ysM1nrSp2">`` attributes, up to the
    next ``<``. The n-th match of the first pattern is paired with the n-th
    match of the second pattern.

    Args:
        ilt: List that receives one ``[first_text, second_text]`` list per
            matched pair. It is modified in place.
        html: Page source to search.

    Returns:
        None. When *html* is not a string, or when the two patterns match a
        different number of times, "页面解析出错了" is printed; in the latter
        case the pairs that could be formed are still appended.
    """
    try:
        # Capture groups return the complete text up to the next "<", so a
        # ">" inside the text no longer truncates the value.
        sp1_texts = re.findall(r'class="ysM1nrSp1">([^<]*)', html)
        sp2_texts = re.findall(r'class="ysM1nrSp2">([^<]*)', html)
    except TypeError:
        # html was not a str/bytes-like object (e.g. None).
        print("页面解析出错了")
        return

    if len(sp1_texts) != len(sp2_texts):
        # The page did not yield matching pairs; report it but keep what
        # can be paired instead of failing half-way through.
        print("页面解析出错了")

    ilt.extend([first, second] for first, second in zip(sp1_texts, sp2_texts))



def printGoodsList(ilt):
    """Print the collected rows as a tab-separated table.

    A header line is printed first, followed by one line per entry with a
    running number starting at 1 and the entry's first two items.

    Args:
        ilt: Sequence of entries; items 0 and 1 of each entry are printed.
    """
    row_format = "{:4}\t{:8}\t{:16}"
    print(row_format.format("序号", "NAME", "FAME"))
    for number, entry in enumerate(ilt, start=1):
        print(row_format.format(number, entry[0], entry[1]))



def main():
    """Fetch the target page, extract the text pairs and print them.

    The page at ``url`` is downloaded with getHTMLText, parsed with
    parsePage into a list of pairs, and the list is printed with
    printGoodsList. A failure while fetching or parsing is reported and the
    (possibly empty) list is still printed.
    """
    # URL to crawl; change this (and the patterns in parsePage) to scrape
    # a different site.
    url = 'https://eecs.pku.edu.cn/szdw1.htm'

    infoList = []

    try:
        html = getHTMLText(url)
        parsePage(infoList, html)
    except Exception as exc:
        # Keep the best-effort behavior, but say what went wrong instead of
        # silently discarding the error.
        print("爬取失败:", exc)

    printGoodsList(infoList)



# Run the crawler only when executed as a script, so importing this module
# does not trigger a network request.
if __name__ == "__main__":
    main()

只需修改以下两处，即可爬取其他网站：

1. url：main() 中要爬取的目标网址
2. 正则表达式：parsePage() 中用于匹配页面内容的两个模式

tomandmath

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
网络爬虫

import re import requests def getHTMLText(url): try: r = requests.get(url, timeout = 30) r.raise_for_status() r.encoding = r.apparent_encoding return r.text except: return "获取连接出错了" ……
复制链接

扫一扫