爬虫技术：爬取今日头条数据-崔庆才思路

最新推荐文章于 2024-08-18 15:31:51 发布

bangqin0414

最新推荐文章于 2024-08-18 15:31:51 发布

阅读量1.5k

点赞数

文章标签：爬虫 java c#

原文链接：http://www.cnblogs.com/meloncodezhang/p/11551139.html

版权

爬虫技术：爬取今日头条数据-崔庆才思路

一. urllib库中将字典转化为url的查询参数

二.请求异常的处理，以及内部的判断逻辑

　　1.返回的json数据为空：原因是requests的请求对象没有加请求头和cookies

import requests
from urllib.parse import urlencode
def get_page_index():
    """Query the Toutiao search API without any headers or cookies.

    Builds the search URL from a fixed set of query parameters, sends a
    plain GET request and prints the response text when the status code
    is 200. Nothing is returned.
    """
    query = urlencode({
        "aid": "24",
        "app_name": "web_search",
        "offset": "0",
        "format": "json",
        "keyword": "街拍",
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289",
    })
    resp = requests.get("https://www.toutiao.com/api/search/content/?" + query)
    if resp.status_code != 200:
        return
    print(resp.text)
# Run the request only when the file is executed as a script
if __name__ == "__main__":
    get_page_index()
# 结果：
{"count":0,"return_count":0,"query_id":"6537385837821170952","has_more":0,"request_id":"20190919170154010017090029827CF0A","search_id":"20190919170154010017090029827CF0A","cur_ts":1568883714,"offset":20,"message":"success","pd":"synthesis","show_tabs":1,"keyword":"街拍","city":"西安","log_pb":{"impr_id":"20190919170154010017090029827CF0A"},"data":null,"data_head":[{"challenge_code":1366,"cell_type":71,"keyword":"街拍","url":"sslocal://search?keyword=%E8%A1%97%E6%8B%8D\u0026from=\u0026source=search_tab"}],"ab_fields":null,"latency":0,"search_type":2,"tab_rank":null}

　　2.正常获得数据

import requests
from urllib.parse import urlencode
def get_page_index():
    """Query the Toutiao search API with browser headers and cookies.

    Sends the module-level ``headers`` and ``cookies`` together with a fixed
    set of query parameters and prints the UTF-8 decoded body when the
    status code is 200. Nothing is returned.
    """
    query = urlencode({
        "aid": "24",
        "app_name": "web_search",
        "offset": "0",
        "format": "json",
        "keyword": "街拍",
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289",
    })
    resp = requests.get(
        "https://www.toutiao.com/api/search/content/?" + query,
        headers=headers,
        cookies=cookies,
    )
    if resp.status_code != 200:
        return
    print(resp.content.decode("utf-8"))
if __name__ == '__main__':
    # Module-level names read by get_page_index(): a browser User-Agent ...
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"}
    # ... and the cookie string copied from a browser session.
    # NOTE(review): requests' cookies= argument expects a {name: value} mapping;
    # here the whole header string is stored under the single key "Cookie".
    # Confirm this is intended, or pass the string as a "Cookie" request header.
    cookies = {"Cookie": "tt_webid=6719272225969096196; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6719272225969096196; csrftoken=b28e41c77cd4f268af393de7d3e9d47a; UM_distinctid=16c4159a9ae7e3-04be696c185f6c-3f385c06-1fa400-16c4159a9afa94; CNZZDATA1259612802=1303724616-1564459685-https%253A%252F%252Fwww.toutiao.com%252F%7C1564459685; WIN_WH=1536_710; s_v_web_id=e588fb5c6570d79a16b67e84decce3d8; __tasessionId=y99fyeyyt1568882979794"}
    get_page_index()

# 结果：
{"count":20,"return_count":20,"query_id":"6537385837821170952","has_more":1,"request_id":"20190919170856010017031149086E0FC","search_id":"20190919170856010017031149086E0FC","cur_ts":1568884136,"offset":20,"message":"success","pd":"synthesis","show_tabs":1,"keyword":"街拍","city":"西安","tokens":["街拍"],"log_pb":{"impr_id":"20190919170856010017031149086E0FC"},"data":[{"ala_src":"user","app_info":{"query_type":"AladdinRpcQueryType"},"cell_type。。。。。。。。。。。。省略

四：

图片地址位置定位：要先请求这个网址，获得响应后解析出对应的image_url

　　解析报错：SyntaxError: Non-UTF-8 code starting with '\xe5'，在程序上方添加 # -*- coding:utf-8 -*-

　　json中的键值对，期望用双引号而不是单引号。原因是正则错误：

五：完整的代码

# -*- coding:utf-8 -*-
import re
import requests
from urllib.parse import urlencode
import os
from requests.exceptions import RequestException
import json
import pymongo
from bs4 import BeautifulSoup
from config import *
from hashlib import md5

# Create the MongoDB client for the server addressed by MONGO_URL
client = pymongo.MongoClient(MONGO_URL)
# Select the database whose name is given by MONGO_DB
db = client[MONGO_DB]


def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of Toutiao search results as raw JSON text.

    Relies on the module-level ``headers`` and ``cookies`` names.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body when the status code is 200, otherwise
        None (also when the request raises a RequestException).
    """
    data = {
        "aid": "24",
        "app_name": "web_search",
        "offset": offset,
        "format": "json",
        "keyword": keyword,
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289",
    }

    # Key point 1: urlencode() turns a dict into a query string,
    # e.g. {"a": "1", "b": "2"} -> "a=1&b=2"
    url = "https://www.toutiao.com/api/search/content/?" + urlencode(data)
    try:
        # Without a timeout a stalled connection would block the crawl forever
        response = requests.get(
            url, headers=headers, cookies=cookies, timeout=timeout
        )
    except RequestException as exc:  # Key point 2: base class of all requests errors
        print("请求出错", exc)
        return None
    if response.status_code == 200:
        return response.content.decode()
    return None


def parse_page_index(html):
    """Yield the article URLs contained in a search-API JSON response.

    Malformed JSON, a non-object payload, or a missing/null ``data`` field
    yield nothing instead of raising.

    Args:
        html: JSON text returned by the search API.

    Yields:
        The ``article_url`` value of every entry in ``data`` that has one.
    """
    try:
        payload = json.loads(html)
    except json.JSONDecodeError:
        print("解析索引页JSON出错")
        return
    if not isinstance(payload, dict):
        return
    # The API can answer with "data": null, which is not iterable
    entries = payload.get("data")  # Key point 3: dict.get() for safe key access
    if not isinstance(entries, list):
        return
    for item in entries:
        if isinstance(item, dict) and "article_url" in item:
            yield item["article_url"]


def get_page_detial(url, timeout=10):
    """Fetch an article detail page and return its decoded body.

    Relies on the module-level ``headers`` and ``cookies`` names.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body when the status code is 200, otherwise
        None (also when the request raises a RequestException).
    """
    try:  # Key point 4: how request errors are handled
        # Without a timeout a stalled connection would block the crawl forever
        response = requests.get(
            url, headers=headers, cookies=cookies, timeout=timeout
        )
    except RequestException as exc:
        print("请求出错", exc)
        return None
    if response.status_code == 200:
        return response.content.decode()
    return None


def _decode_gallery(raw):
    """Decode the escaped JSON captured from ``gallery: JSON.parse("...")``.

    Returns the parsed object, or None when the text cannot be decoded.
    """
    # The capture looks like {\"count\":11,\"sub_images\":[{\"url\":\"http:\\\u002F...}
    # and json.loads() rejects it as-is ("Expecting property name enclosed in
    # double quotes"). First try the original approach: drop every backslash
    # and turn the leftover "u002F" back into "/".
    flattened = raw.replace("\\", "").replace("u002F", "/")
    try:
        return json.loads(flattened)
    except json.JSONDecodeError:
        pass
    # Fallback for text the flattening corrupts (e.g. escaped quotes inside a
    # value): unescape it as the body of a JSON string literal, then parse.
    try:
        return json.loads(json.loads('"' + raw + '"'))
    except json.JSONDecodeError:
        return None


def parse_page_detial(html, url):
    """Extract the title and gallery image URLs from an article page.

    Every image URL found is also passed to download().

    Args:
        html: Page source of the article.
        url: Address of the article, stored in the result.

    Returns:
        A dict with "title", "images" and "url" when the page contains a
        gallery with a ``sub_images`` field, otherwise None.
    """
    soup = BeautifulSoup(html, "lxml")
    # Key point 5: CSS selectors; a page without <title> gets an empty title
    title_tags = soup.select("title")
    title = title_tags[0].get_text() if title_tags else ""
    # Key point 6: re.S lets "." span line breaks. The raw string keeps the
    # \( and \) escapes from being treated as (invalid) string escapes.
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)
    result = re.search(images_pattern, html)
    if not result:
        return None
    data = _decode_gallery(result.group(1))
    if data is None:
        print("解析图集JSON出错", url)
        return None
    if not isinstance(data, dict) or "sub_images" not in data:
        return None
    # A null sub_images is treated as an empty gallery
    sub_images = data.get("sub_images") or []
    images = [item.get("url") for item in sub_images]
    for img in images:
        download(img)
    return {
        "title": title,
        "images": images,
        "url": url
    }

def save_to_mongo(ret_dict):
    """Insert one parsed article record into the MONGO_TABLE collection.

    Args:
        ret_dict: Document to store; its "title" is printed on success.

    Returns:
        True when the document was inserted, False otherwise.
    """
    # Key point 8: the MongoDB connection settings come from the config module
    try:
        # Collection.insert() is deprecated and was removed in PyMongo 4;
        # insert_one() is the supported call for a single document
        result = db[MONGO_TABLE].insert_one(ret_dict)
    except pymongo.errors.PyMongoError as exc:
        print("插入数据到数据库失败", exc)
        return False
    if result.inserted_id is not None:
        print("插入数据到数据库成功", ret_dict["title"])
        return True
    return False


def download(url, timeout=10):
    """Download one image and hand its bytes to saveimg().

    Relies on the module-level ``headers`` and ``cookies`` names.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None in every case; failures are only reported on stdout.
    """
    print("正在下载图片", url)
    try:
        # Without a timeout one stalled image would block the whole crawl
        response = requests.get(
            url, headers=headers, cookies=cookies, timeout=timeout
        )
    except RequestException as exc:
        print("请求出错", exc)
        return None
    if response.status_code == 200:
        saveimg(response.content)
    return None

def saveimg(content):
    """Write image bytes into the current working directory, once per content.

    The file is named after the MD5 hex digest of ``content`` with a ``jpg``
    extension, so identical bytes map to the same path; an existing file is
    left untouched.
    """
    # Key point 9: the MD5 digest of the bytes serves as a de-duplication key
    digest = md5(content).hexdigest()
    target = "%s/%s.%s" % (os.getcwd(), digest, "jpg")
    # Key point 10: os helpers for the working directory and existence checks
    if os.path.exists(target):
        return
    with open(target, "wb") as fh:
        fh.write(content)


def main():
    """Crawl the search pages for a fixed keyword and store each parsed gallery.

    Walks the offsets from START_PAGE up to END_PAGE in steps of 20, fetches
    every article listed on each index page, and saves whatever
    parse_page_detial() returns for it.
    """
    keyword = "街拍"
    for offset in range(START_PAGE, END_PAGE, 20):
        index_html = get_page_index(offset, keyword)
        if not index_html:
            continue
        for article_url in parse_page_index(index_html):
            detail_html = get_page_detial(article_url)
            if not detail_html:
                continue
            record = parse_page_detial(detail_html, article_url)
            if record:
                save_to_mongo(record)


if __name__ == '__main__':
    # headers and cookies are module-level names read by get_page_index(),
    # get_page_detial() and download(); "xx" is presumably a placeholder to be
    # replaced with real browser values before running.
    headers = {
        "User-Agent": "xx"}
    # NOTE(review): requests' cookies= argument expects a {name: value}
    # mapping; confirm that a single "Cookie" key is what is intended here.
    cookies = {
        "Cookie": "xx"}
    main()

试运行爬取所有的街拍：报错json.decoder.JSONDecodeError，因此代码还得进行优化，排除异常。

六：知识点总结

urlencode是从urllib.parse中的一个方法：将字典变成url的查询参数

from urllib.parse import urlencode

# Query parameters to append to the URL
data = {"a":1,"b":2}

# NOTE: this demo URL lacks the "//" after the scheme; a real address would
# be written "http://www.baidu.com/?"
url = "http:www.baidu.com/?"

# urlencode(data) produces "a=1&b=2"
print(url + urlencode(data))

http:www.baidu.com/?a=1&b=2

md5加密的不一致问题

一直以来都是用 hashlib 中的 md5 进行加密：md5.update(二进制)、md5.hexdigest()。可能会出现对相同的字符串进行加密、加密结果却不一样的问题，看来是在同一个 md5 对象上重复调用 update 方法造成的。

from hashlib import md5
# One hash object shared by every loop iteration
fp = md5()

demo = ["1","1","3","3"]

for i in demo:
    # update() appends to the data already fed in, so the digests below are
    # those of "1", "11", "113" and "1133", not of each item on its own
    fp.update(i.encode("utf-8"))
    print(fp.hexdigest())

# 结果：

c4ca4238a0b923820dcc509a6f75849b
6512bd43d9caa6e02c990b0a82652dca
73278a4a86960eeb576a8fd4c9ec6997
fd06b8ea02fe5b1c2496fe1700e9d16c

# 原因是md5.update()会将上次的串和这次的进行拼接：1、11、113、1133，每次加密的串都不同，结果肯定不同。
所以每加密之前，都对md5进行实例化，才能保证相同内容加密结果一样，因为以前这个方法都是放在函数里面的，每次调用函数，都会重新实例化md5,因此不存在问题。循环就存在问题
上面代码可以改为

from hashlib import md5

demo = ["1","1","3","3"]

for i in demo:
    # A fresh md5 object per item, so each digest covers only that item
    fp = md5()
    fp.update(i.encode("utf-8"))
    print(fp.hexdigest())

# 结果为：

c4ca4238a0b923820dcc509a6f75849b
c4ca4238a0b923820dcc509a6f75849b
eccbc87e4b5ce2fe28308fd9f2a7baf3
eccbc87e4b5ce2fe28308fd9f2a7baf3

for i in demo:
    print(md5(i.encode("utf-8")).hexdigest())  # works: md5(...) creates a new hash object on every iteration

# 结果

c4ca4238a0b923820dcc509a6f75849b
c4ca4238a0b923820dcc509a6f75849b
eccbc87e4b5ce2fe28308fd9f2a7baf3
eccbc87e4b5ce2fe28308fd9f2a7baf3

# 看源码也没有理解update真正意图，只是说用字符串更新对象。 后续解决
# 补充：update(data) 的作用是向同一个哈希对象追加数据，连续调用 update(a)、update(b) 等价于一次 update(a+b)，主要用于分块计算大文件的摘要。

os模块的使用方法

os的基本用法

1. os.getcwd()：查看当前所在路径。

# os.getcwd() returns the current working directory as a string
current_path = os.getcwd()
print(current_path)

# 运行结果
C:\Users\Administrator\AppData\Roaming\Sublime Text 3\Packages\User


2. os.listdir(path):列举目录下的所有文件。返回的是列表类型。

# List the entries of the directory obtained from os.getcwd() above;
# the variable holding it is current_path (``current`` was never defined)
dir_list = os.listdir(current_path)

print(dir_list)

# 运行结果

['11.py', 'cuiqingcai.py', 'Localization.sublime-settings', 'oscrypto-ca-bundle.crt', 'Package Control.cache', 'Package Control.last-run', 'Package Control.merged-ca-bundle', 'Package Control.sublime-settings', 'Package Control.user-ca-bundle', 'Preferences.sublime-settings', 'reids分布式锁', 'sha1.py', 'test.py', 'untitled.sublime-build']

具体用法见：https://www.cnblogs.com/yufeihlf/p/6179547.html

Mongo数据库与python的交互

import pymongo  # the module used to talk to MongoDB

# Step 1: create a client connected to the MongoDB server (ip and port)

from pymongo import MongoClient
client = MongoClient(host,port)
collection = client[db名][集合名]   # db名 plays the role of the database name, 集合名 that of the table name

# Step 2: insert a document

ret = collection.insert_one({"name":"test10010","age":33})
print(ret) # inspect the returned result to decide what to do next
if ret:
    xxxx  # placeholder for the follow-up logic

示例：

import pymongo

client = pymongo.MongoClient("localhost")

# Select the given collection in the given database; it is created if missing

collection = client["test"]["new"]

# Collection.insert() was removed in PyMongo 4; insert_one() returns a result
# object whose inserted_id is the _id of the new document
ret = collection.insert_one({"new":"python"})

print(ret.inserted_id)

# 结果：
5d85ce978a808f42364b045c

插入前：


插入后：

正则表达式知识点回顾：

import re

# "匹配规则" stands in for the actual regex text; re.S makes "." match newlines too
pattern = re.compile("匹配规则", re.S)

re.compile() 返回的就是一个编译好的匹配规则（Pattern 对象），配合 search、findall、match 等方法使用

import  re

a = """aaaaaaabbbbbbbb
111111ccccc"""

# Without re.S, "." does not match the newline, so the pattern cannot reach "cccc"
pattern1 = re.compile("aaaaaaa(.*?)cccc")
print(re.search(pattern1,a))

# None  (re.S lets the pattern match across the whole text, line breaks included)

# Assign only pattern2 so that pattern1 keeps its flag-less definition
pattern2 = re.compile("aaaaaaa(.*?)cccc",re.S)
print(re.search(pattern2,a))

# <re.Match object; span=(0, 26), match='aaaaaaabbbbbbbb\n111111cccc'>