Python Web Crawlers

Definition of a crawler:

A crawler is a program written to simulate a browser surfing the web and then automatically fetch data from the internet.
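The "simulate a browser" part mostly comes down to sending the request headers a real browser would send. A minimal sketch, where the URL and the User-Agent string are placeholders, not from the original post:

from urllib.request import Request, urlopen

# send a browser-like User-Agent so the server treats the request as a browser's
req = Request(
    "http://example.com",                   # placeholder URL
    headers={"User-Agent": "Mozilla/5.0"},  # minimal browser-like header
)
html = urlopen(req).read().decode("utf-8")
print(html[:200])  # first 200 characters of the page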

Crawler framework

[Figure: crawler framework diagram (not preserved)]
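Since the diagram is not preserved: such framework diagrams typically show a fetch → parse → store pipeline. A minimal sketch of that structure (function names and the example URL are illustrative):

from urllib.request import urlopen
from bs4 import BeautifulSoup

def fetch(url):
    # download the raw HTML of a page
    return urlopen(url).read().decode("utf-8")

def parse(html):
    # extract the data we care about from the HTML
    soup = BeautifulSoup(html, features="lxml")
    return [p.get_text() for p in soup.find_all("p")]

def store(items, path):
    # persist the extracted data to disk
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(items))

# store(parse(fetch("http://example.com")), "out.txt")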

Basic crawler code: regular expressions

import re

text = "dog runs to cat"  # avoid shadowing the built-in str

# simple matching: search for a literal substring
part1 = "dog"
part2 = "cat"
print(re.search(part1, text))
print(re.search(part2, text))

# flexible matching: [au] matches either 'a' or 'u'
part = r"r[au]n"
print(re.search(part, text))

print(re.search(r"r[A-Z]n", "dog runs to cat"))  # None: 'u' is lowercase
print(re.search(r"r[a-z]n", "dog runs to cat"))  # matches "run"

print(re.search(r"r[0-9]n", "dog r2ns to cat"))  # matches "r2n"

# matching by character type
print(re.search(r"r\dn", "run r4n"))   # \d: any digit      -> "r4n"
print(re.search(r"r\Dn", "run r4n"))   # \D: any non-digit  -> "run"
print(re.search(r"r\sn", "r\nn r4n"))  # \s: any whitespace -> "r\nn"

Case 1: download a novel from a novel site

from urllib.request import urlopen
from bs4 import BeautifulSoup


html = urlopen("http://www.jueshitangmen.info/tian-meng-bing-can-11.html")\
    .read().decode("utf-8")
print(html)

print("=================================================")
soup = BeautifulSoup(html, features="lxml")

# find all <p> tags first
all_p = soup.find_all('p')
print(all_p)

print("=================================================")
for i in all_p:
    print("\n", i.get_text())  # get_text() must be called, not just referenced

Case 2: download weather data

from urllib.request import urlopen
from bs4 import BeautifulSoup


html = urlopen("http://www.weather.com.cn/weather/101270101.shtml")\
    .read().decode("utf-8")
#print(html)

print("===========================")
soup = BeautifulSoup(html, features="lxml")

# find the <ul> that holds the 7-day forecast, then its <li> entries
all_ul = soup.find_all('ul', attrs={"class": "t clearfix"})
all_li = all_ul[0].find_all("li")

for i in all_li:
    #print(i)
    h1 = i.find("h1").get_text()                         # date
    p1 = i.find("p", attrs={"class": "wea"}).get_text()  # weather description
    p2 = i.find("p", attrs={"class": "tem"})
    tem = p2.find("span").get_text() + "~" + p2.find("i").get_text()  # temperature range
    win = i.find("p", attrs={"class": "win"}).find("i").get_text()    # wind

    print(h1)
    print(p1)
    print(tem)
    print(win)
    print("=============================")


Case 3: batch-download images

# -*- coding: utf-8 -*-
"""
Created on Fri Dec 18 14:46:42 2020

@author: 57616
"""

import urllib.request
import urllib.parse
import re
import os

# Headers are required: without a Referer the server returns a 403 error,
# and the User-Agent makes the request look like it comes from a browser.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    "referer": "https://image.baidu.com"
}
url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word={word}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&cg=girl&pn={pageNum}&rn=30&gsm=1e00000000001e&1490169411926="
keyword = input("Enter a search keyword: ")
# URL-encode the keyword
keyword = urllib.parse.quote(keyword, "utf-8")

n = 0
j = 0

while n < 3000:
    error = 0
    n += 30
    url1 = url.format(word=keyword, pageNum=str(n))
    # build the request
    rep = urllib.request.Request(url1, headers=header)
    # open the page
    rep = urllib.request.urlopen(rep)
    # read the page content
    try:
        html = rep.read().decode("utf-8")
        # print(html)
    except Exception:
        print("Error!")
        error = 1
        print("Error at result offset: " + str(n))
    if error == 1:
        continue
    # regex for the thumbnail URLs
    p = re.compile(r"thumbURL.*?\.jpg")
    # findall returns every match as a list
    s = p.findall(html)

    if not os.path.isdir("D:/test_pic"):
        os.makedirs("D:/test_pic")
    with open("testpic.txt", "a") as f:
        # download each image
        for i in s:
            i = i.replace('thumbURL":"', '')
            print(i)
            f.write(i)
            f.write("\n")
            # save the image to disk
            urllib.request.urlretrieve(i, "D:/test_pic/pic.{num}.jpg".format(num=j))
            j += 1
        # no explicit f.close() needed: the with-block closes the file
print("Total images downloaded: " + str(j))

Case 4: upload files with Flask (PyCharm)

HTML5 upload page

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Flask服务器</title>
</head>
<body>
<h1>文件</h1>
<form action="" enctype="multipart/form-data" method="post">
    <input type="file" name="file">
    <input type="submit" name="上传">
</form>
</body>
</html>


from flask import Flask, render_template, request, redirect, url_for
from werkzeug.utils import secure_filename
import os


app = Flask(__name__)


@app.route("/upload", methods=["POST", "GET"])
def upload():
    if request.method == "POST":
        f = request.files["file"]
        basepath = os.path.dirname(__file__)  # directory containing this file
        upload_dir = os.path.join(basepath, "static/upload")
        os.makedirs(upload_dir, exist_ok=True)  # make sure the target folder exists
        upload_path = os.path.join(upload_dir, secure_filename(f.filename))
        print(upload_path)
        f.save(upload_path)
        return redirect(url_for('upload'))
    return render_template('upload.html')


# start the development server
if __name__ == '__main__':
    app.run(port=6699, debug=True)
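Note that secure_filename only sanitizes the file name; it does not restrict what kinds of files can be uploaded. A common addition is an extension whitelist checked before saving (the allowed set below is an illustrative choice, not from the original):

ALLOWED_EXTENSIONS = {"png", "jpg", "jpeg", "gif", "txt"}  # illustrative whitelist

def allowed_file(filename):
    # accept only names that end in a whitelisted extension
    return "." in filename and \
        filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS

# inside upload(): only call f.save() when allowed_file(f.filename) is True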
    