Web Scraping Study Notes -- 02

1. Encoding parameters and sending requests with urllib

from urllib import request, parse

params = parse.urlencode(params_dict)             # convert a dict (or a sequence of two-element tuples) into a URL-encoded query string
req = request.Request(url, data=params.encode())  # build a Request object wrapping the URL and the request body

Note: data must be bytes; if a data argument is passed, the request is sent as a POST by default.

response = request.urlopen(req)   # urlopen() accepts either a URL string or a Request object
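
A minimal sketch of that GET/POST distinction, using httpbin.org purely as a neutral test endpoint (it is not part of the original notes):

from urllib import request, parse

params = parse.urlencode({'kw': 'banana'})

# GET: append the encoded parameters to the URL and pass no data argument
get_resp = request.urlopen('https://httpbin.org/get?' + params)

# POST: pass the encoded parameters as bytes through data
post_resp = request.urlopen(request.Request('https://httpbin.org/post', data=params.encode()))

print(get_resp.status, post_resp.status)   # 200 200 if both requests succeed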

1.1 Scraping Baidu Translate

Code demo

from urllib import request,parse

url = 'https://fanyi.baidu.com/sug'   # Baidu Translate suggestion endpoint

params_dict = {
    'kw': 'banana',
}

params = parse.urlencode(params_dict)   # convert the dict (or a sequence of two-element tuples) into a query string
print("Encoded query string: " + params)
req = request.Request(url, data=params.encode())   # without data the request is a GET; passing data makes it a POST

response = request.urlopen(req)
if response.status == 200:
    print("Request succeeded!")
    print(response.read().decode())

print("*********** the same request with the requests library *************")
import requests

post_response = requests.post(url, data=params_dict)
if post_response.status_code == 200:
    print("POST with the requests library succeeded")
    print(post_response.text)

The full translation endpoint v2transapi also needs request headers; the sign, token and Cookie values below were copied from a captured browser request and may have expired.

from urllib import request,parse

url = 'https://fanyi.baidu.com/v2transapi'
​
params_dict = {
    'from': 'en',
    'to': 'zh',
    'query': 'banana',
    'transtype': 'realtime',
    'simple_means_flag': '3',
    'sign': '594751.913422',
    'token': '4d0a1a6ebe0a184c8c44bb5d3477330b',
}

headers = {
   'Host': 'fanyi.baidu.com',
   'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
   'Cookie': 'BIDUPSID=8CDE36310DA13D77BA6F2F1EFCCDAD60; PSTM=1539593553; BAIDUID=8CDE36310DA13D77BA6F2F1EFCCDAD60:SL=0:NR=10:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; H_PS_PSSID=26522_1424_21094_27377; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1540992599,1541036898; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1541041590; delPer=0; PSINO=1; locale=zh; to_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; from_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D'
}

params = parse.urlencode(params_dict)
req = request.Request(url, data=params.encode(), headers=headers)
response = request.urlopen(req)

if response.status == 200:
    print("Request succeeded!")
    print(response.read().decode())

 

2. Decoding the percent-encoded parts of a URL

from urllib import request

real_url = request.unquote("a URL containing % escapes")   # decode the %xx escapes back to the original characters
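
A quick round-trip sketch of what unquote does (the sample string is my own; urllib.parse is the documented home of quote/unquote):

from urllib import parse

encoded = parse.quote("电影")        # '%E7%94%B5%E5%BD%B1'
print(encoded)
print(parse.unquote(encoded))        # decodes back to '电影'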

2.1 Scraping Douban movie rankings

Code demo

Capturing the dynamic (Ajax) request

from urllib import request,parse

# raw_url = 'https://movie.douban.com/j/chart/top_list'
# real_url = request.unquote(raw_url)  # decode the %xx escapes back to the original characters

def spider_movie_rating(category, start, limit=20):
    params_dict = {
        'type': category,
        'interval_id': '100:90',
        'action': '',
        'start': start,
        'limit': limit
    }

    params = parse.urlencode(params_dict)  # convert the dict into a query string
    url = 'https://movie.douban.com/j/chart/top_list'
    url += "?" + params
    req = request.Request(url)
    response = request.urlopen(req)
    m = response.read().decode('utf-8')
    if response.status == 200:
        with open('movie_rating_%d.txt' % start, 'wb') as f:
            f.write(m.encode())
        print("Saved successfully")

if __name__ == '__main__':
    for i in range(5):
        spider_movie_rating(23, 20*i)

Downloading the cover image of each movie

import json
from urllib import request

with open('movie_rating_0.txt', 'r') as f:
    movie_list = json.load(f)
    for movie in movie_list:
        imgpath = movie['cover_url']
        request.urlretrieve(imgpath, './images/' + imgpath.split('/')[-1])
        print(imgpath.split('/')[-1] + " saved!")
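
Note that urlretrieve will not create the ./images/ directory by itself, so it must exist before the loop runs; a small guard (my addition, not in the original code) could be placed before the loop:

import os

os.makedirs('./images', exist_ok=True)   # create the folder if it does not exist yet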

 

3. Other uses of BeautifulSoup:

1. It can be used together with regular expressions:

   import re
   soup = BeautifulSoup(text_to_parse, 'lxml')
   soup.find_all(re.compile("some regex"))

2. Tags can also be selected with CSS selectors: soup.select("CSS selector"). Both usages are shown in the short sketch below.
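
A minimal, self-contained sketch of both usages (the HTML snippet is made up for illustration):

from bs4 import BeautifulSoup
import re

html = '<div id="top"><ul><li class="hot">news</li></ul><main>body</main></div>'
soup = BeautifulSoup(html, 'lxml')

print(soup.find_all(re.compile("^m")))    # tags whose name starts with "m": [<main>body</main>]
print(soup.select("div#top li.hot"))      # CSS selector match: [<li class="hot">news</li>]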

3.1 Scraping the Sohu homepage and extracting specified tags

Code demo

from bs4 import BeautifulSoup
import requests
import re

# The commented-out part below downloads the page and saves it locally
# url = 'http://www.sohu.com'
# r = requests.get(url)
# with open('sohu.html','wb') as f:
#     f.write(r.text.encode())

# Parsing the saved page with BeautifulSoup
with open('sohu.html', 'rb') as f:
    html = f.read().decode()
    soup = BeautifulSoup(html, 'lxml')
    for tag in soup.find_all(re.compile("^m")):  # tags whose name starts with "m"
        print(tag)

    print(soup.select("div ul"))   # all ul tags inside a div, returned as a list
    print(soup.select("div#sohuTopc > div.ph-link"))

4. Passing data via Ajax in a custom Django project

Code demo

views

from django.http import JsonResponse
from django.shortcuts import render

def show_fruits(request):
    fruits = ["apple", "banana", "pear", "orange"]
    return render(request, 'fruits.html', locals())

def show_tasted_fruits(request):
    tasted_fruits = ["tasty dragon fruit", "tasty pineapple", "delicious kiwi"]
    return JsonResponse({'tasted_fruits': tasted_fruits})  # JsonResponse expects a dict by default
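
If you want to return the list directly instead of wrapping it in a dict, JsonResponse also accepts safe=False (an alternative, not what the notes above do):

return JsonResponse(tasted_fruits, safe=False)   # allow serializing a non-dict object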
​

templates

<!DOCTYPE html>
{% load static %}
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Fruit information</title>
    <script type="text/javascript" src="{% static 'js/jquery-3.1.1.js' %}"></script>
    <script type="text/javascript" src="{% static 'js/fruit.js' %}"></script>
</head>
<body>
   <ul>
      {% for fruit in fruits %}
          <li> {{ fruit }} </li>
      {% endfor %}
   </ul>
   <div id="tasted" style="color:red">

   </div>
</body>
</html>

js/fruit.js
$(function(){
    $.ajax({
        url: '/myapp/tasted/',
        type: 'GET',
        success: function(data){    // callback invoked when the request succeeds
            var tasted_fruits = data["tasted_fruits"];
            for(var key in tasted_fruits){
                $("div#tasted").append(tasted_fruits[key] + "<br/>");
            }
            alert('success');
        }
    })
})

urls

# project-level (root) URLconf
from django.contrib import admin
from django.urls import path, include

urlpatterns = [
    path('admin/', admin.site.urls),
    path('myapp/', include('myapp.urls')),
]


# app-level URLconf (myapp/urls.py)
from django.urls import path

from myapp.views import *

urlpatterns = [
    path('fruits/', show_fruits),
    path('tasted/', show_tasted_fruits)
]

5. Homework

5.1 Fetching and saving the weather with a weather API

from urllib import request
import json

url = 'http://www.weather.com.cn/data/sk/101110101.html'
url1 = "http://www.weather.com.cn/data/sk/101110103.html"

req = request.Request(url1)
response = request.urlopen(req)
if response.status == 200:
    weather_str = response.read().decode()
    weather_dict = json.loads(weather_str)
    print(weather_dict)
    with open('weather.txt', 'w', encoding='utf-8') as f:
        info_dict = weather_dict["weatherinfo"]
        city = info_dict["city"]
        temperature = info_dict["temp"]
        wind_direction = info_dict["WD"]
        f.write("City: " + city + "\n")
        f.write("Temperature: " + temperature + "\n")
        f.write("Wind direction: " + wind_direction + "\n")
        print("Weather info saved!")
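
To keep the full response instead of just three fields, the whole dict could also be dumped as-is (weather.json is my own filename choice, not part of the homework):

with open('weather.json', 'w', encoding='utf-8') as f:
    json.dump(weather_dict, f, ensure_ascii=False, indent=2)   # keep non-ASCII characters readable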

 
