8.20(day19)re,typing,collections模块，简单爬虫

最新推荐文章于 2024-07-18 12:57:54 发布

dongjia6380

最新推荐文章于 2024-07-18 12:57:54 发布

阅读量82

点赞数

文章标签：爬虫 python

原文链接：http://www.cnblogs.com/jiann/p/11529054.html

版权

复习

numpy模块

numpy数组(矩阵)的运算, 科学运算, tensorflow

pandas模块

文件(excel)的处理

read_excel()/to_excel()

matplotlib模块

画图, plt.plot()/plt.bar()/plt.scatter()/plt.hist()/plt.pie()

re模块

# re模块   从字符串中找特定的字符串
# import re
# s = '王大炮打炮被大炮打死了王大炮打炮被大炮打死了'
# # ^ 从开头查找
# print(re.findall('^王大炮',s))
# # $ 从结尾查找
# print(re.findall('死了$',s))
# # [] 匹配中间字符，只要单个字符
# s = 'asdfghjkl'
# print(re.findall('[as]',s))
# # [^] 对[]内元素取反
# print(re.findall('[^asd]',s))
# # . 任意字符(除了\n)
# # print(re.findall('a..',s))
#
# s = 'asdasdaaasdaaaaa'
# # * 前面的字符0到无数个
# print(re.findall('a*',s))   # 不匹配a的位置也会返回空字符串''
# # + 1到无穷个
# print(re.findall('a+',s))
# ? 0到1个
# print(re.findall('a?',s))
# {m} 匹配前面字符m个
# print(re.findall('a{5}',s))
# {n,m} 匹配前面字符n到m个
# print(re.findall('a{2,5}',s))

# \d 数字
# s = '123asd456fgh'
# print(re.findall('\d',s))
# \D 非数字
# print(re.findall('\D',s))
# \w 数字字母下划线
# print(re.findall('\w',s))
# \W 非数字字母下划线
# print(re.findall('\W',s))
# \s 空格/\t/\n
# s = 'da-d-fa   f\nak'
# print(re.findall('\s',s))
# \S 非空格\n\t
# print(re.findall('\S',s))
# \取消意义
# s = 'a\s'
# print(re.findall(r'a\\s',s))
# .*贪婪模式，一直寻找
# s = '123asdfgh45asasdada'
# print(re.findall('a.*s',s))
# .*? 非贪婪模式
# print(re.findall('a.*?s',s))
# ()   只返回括号内匹配到的内容
# A|B   匹配A或B，两者都要


# re模块的用法
 # re.compile
'''
修饰符 描述
re.I    使匹配对大小写不敏感
re.L    做本地化识别（locale-aware）匹配
re.M    多行匹配，影响 ^ 和 $
re.S    使 . 匹配包括换行在内的所有字符
re.U    根据Unicode字符集解析字符。这个标志影响 \w, \W, \b, \B.
re.X    该标志通过给予你更灵活的格式以便你将正则表达式写得更易于理解。
'''
# s = 'asdfgh\njkl123'
# print(re.findall('\d+',s))
# com = re.compile('\d+')
# # par = '\d+'
# com = re.compile('3.')
# print(re.findall(com,s))

# re.split()   按照匹配规则切割
# re.sub()   按照匹配规则替换
# re.subn()   按照匹配规则替换并计数
# # re.findall()拿出匹配的东西
# re.match()从开头搜索，找到就返回匹配对象，没找到就返回None
# re.search()搜索到第一个就停止

typing模块

# typing模块: 提供了 Generator、Iterable、Iterator 等类型，用于给函数的参数和返回值添加类型注解(只是提示，不会强制检查)
from typing import SupportsInt
def f(x: int, y: int):
    """Add the two arguments and return the result.

    The ``int`` annotations are hints only; Python does not enforce them
    at call time.
    """
    total = x + y
    return total


# Demo call: compute 10 + 20 and show the result.
res = f(10, 20)
print(res)
# 一些数据类型python不支持
# 参数数据类型
# 生成器：generator   可迭代对象迭代器对象
def func(i: int, f: float, b: bool, lt: list, tup: tuple, dic: dict):
    """Return the six arguments packed into a list, in parameter order."""
    return [i, f, b, lt, tup, dic]


# The arguments below deliberately ignore some hints (2 for the float, 6 for
# the dict): this runs without error, it is just not conventional.
res = func(1, 2, True, [1, 2], (1, 2), 6)
print(res)
def func1(lt):
    """Print the first element of ``lt``."""
    first = lt[0]
    print(first)

collections模块

# collections模块：复杂的数据类型
# 有名元组
# p = (1,2)
# from collections import namedtuple
# point = namedtuple('point',['x','y'])
# p = point(1,2)
# print(p.x)
# print(p.y)

# # 默认字典
# from collections  import defaultdict
# dic = defaultdict(lambda:'nan')   #dic={}
# dic['a']=1
# print(dic['a'])
# print(dic['c'])
#
# # 双端队列
# # lis = [1,2,3]
# # lis.append(4)
# # print(lis)
# from collections  import deque
# de = deque([1,2,3,])
# de.append(4)
# de.appendleft(0)
# print(de)
# 计数器
# from collections import Counter
# s = 'programming'
# dic = {}
# for i in s:
#     if i in dic:
#         dic[i]+=1
#     else:
#         dic[i]=1
# print(dic)
# c = Counter()   #字典
# for i in s:
#     c[i]+=1
# print(c)

简单爬虫

#简单爬虫
# import requests
# res = requests.get('http://duanziwang.com/')
# data = res.text
# # print(data)
# import re
# res = re.findall(' <div class="post-content">        <p>(.*?)</p>    </div>',data)
# for i in res:
#     print(i)
import re
import os
import requests
# Simple image scraper: walk list pages 1-8 of the site, pull every .jpg
# ``src`` attribute out of the HTML and save each image under ``img/``.
from urllib.parse import urljoin

BASE_URL = "http://www.xiaohuar.com"
IMG_DIR = "img"
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server hangs the script

# The dot before "jpg" is escaped so only a literal ".jpg" suffix matches, and
# [^"] keeps a match from running past the closing quote into a later attribute.
IMG_SRC_PATTERN = re.compile(r'src="([^"]*?\.jpg)"')

# open() does not create missing directories, so make sure the target exists.
os.makedirs(IMG_DIR, exist_ok=True)

for page in range(1, 9):
    page_url = f"{BASE_URL}/list-2-{page}.html"
    page_res = requests.get(page_url, timeout=REQUEST_TIMEOUT)

    for src in IMG_SRC_PATTERN.findall(page_res.text):
        # urljoin prefixes site-relative paths with the host and leaves
        # already-absolute URLs untouched (the original always prefixed,
        # because str.startswith('') is true for every string).
        img_url = urljoin(BASE_URL, src)
        img_name = img_url.split('/')[-1]
        img_path = os.path.join(IMG_DIR, img_name)

        img_res = requests.get(img_url, timeout=REQUEST_TIMEOUT)
        if not img_res.ok:
            # Do not store an HTML error page under a .jpg name.
            print(f"下载图片{img_name}失败: HTTP {img_res.status_code}")
            continue

        # The with-block flushes and closes the file on exit.
        with open(img_path, 'wb') as fw:
            fw.write(img_res.content)
        print(f"下载图片{img_name}成功")

转载于:https://www.cnblogs.com/jiann/p/11529054.html

dongjia6380

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
8.20(day19)re,typing,collections模块，简单爬虫

复习numpy模块numpy数组(矩阵)的运算, 科学运算, tensorflowpandas模块文件(excel)的处理read_excel()/to_excel()matplotlib模块画图, plt.plot()/plt.bar()/plt.scatter()/plt.hist()/plt.pie()re模块# re模块从字符串中找特定的字符串# imp...
复制链接

扫一扫