python学习2019/05/05,1.12.33

最新推荐文章于 2022-08-10 10:51:25 发布

为什么我会那么逗

最新推荐文章于 2022-08-10 10:51:25 发布

阅读量132

点赞数

本文链接：https://blog.csdn.net/xtbsl/article/details/89859283

版权

爬虫

爬虫的前奏：
1.明确目的
2.找到对应的网页
3.分析网页的结构找到数据所在的标签位置

1.模拟HTTP请求：向服务器发送请求，获取服务器返回给我们的HTML
2.用正则表达式从HTML中提取我们要的数据

import re
from urllib import request
#断点调试 F5
class Spider():
    """Minimal regex-based scraper for a live-stream category page.

    ``go()`` downloads ``url``, cuts the page into entry fragments with
    ``root_patten`` and pulls a name and a number out of every fragment with
    ``name_pattern`` and ``number_pattern``.
    """

    # Page to download.
    url = 'https://live.ixigua.com/category/1/114/'

    # Regex selecting the HTML fragment of one entry.
    # NOTE(review): only the lazy "[\S\s]*?" core of this pattern survived in
    # the published snippet; the HTML tags around it (and presumably a capture
    # group) appear to have been lost. Restore them before trusting the
    # results -- TODO confirm against the live page markup.
    root_patten = r'[\S\s]*?'

    # Regexes applied to each fragment. The original code referenced these
    # names without ever defining them, so there is no value to preserve.
    # TODO: set both to the tag-delimited patterns for the target page.
    name_pattern = None
    number_pattern = None

    def __fetch_content(self):
        """Download ``url`` and return the response body decoded as UTF-8."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(self.url) as response:
            raw = response.read()  # bytes
        return raw.decode('utf-8')

    def __analysis(self, htmls):
        """Extract one ``{'name': [...], 'number': [...]}`` dict per entry.

        Args:
            htmls: Decoded HTML text of the page.

        Returns:
            A list of dicts, one per ``root_patten`` match. Each value is the
            list ``re.findall`` returned for that fragment. The list is empty
            when ``root_patten`` matches nothing.

        Raises:
            ValueError: If any of the three patterns is unset.
        """
        labels = ('root_patten', 'name_pattern', 'number_pattern')
        unset = [label for label in labels if not getattr(self, label)]
        if unset:
            raise ValueError(
                'Spider patterns not configured: ' + ', '.join(unset)
            )

        anchors = [
            {
                'name': re.findall(self.name_pattern, fragment),
                'number': re.findall(self.number_pattern, fragment),
            }
            for fragment in re.findall(self.root_patten, htmls)
        ]
        # Debug output of the first entry; skipped when nothing matched so an
        # empty page no longer raises IndexError.
        if anchors:
            print(anchors[0])
        return anchors

    def __refine(self, anchors):
        """Post-process the parsed entries (not implemented yet)."""
        pass

    def go(self):
        """Run the pipeline: fetch the page, parse it, then refine the entries."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        self.__refine(anchors)

# Script entry: build a spider and run it once.
spider = Spider()
spider.go() # runs fetch -> parse -> refine; go() has no return statement, so nothing is returned here

#失败中。。。

1.12.33