1. Sending requests with requests
import requests

# url and data below are placeholders for the target URL and POST form data
s = requests.Session()
payload = {'key1': 'value1', 'key2': 'value2'}
proxies = {'http': 'http://47.98.163.18:8080', 'https': 'http://47.98.163.18:8080'}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

# headers          request headers
# params           query-string parameters
# data             POST form data
# verify           SSL certificate verification
# allow_redirects  whether to follow redirects
# proxies          proxy settings
requests.get(url, headers=headers, verify=False, params=payload, allow_redirects=False, proxies=proxies).content.decode('utf-8')
requests.post(url, headers=headers, data=data, verify=False, allow_redirects=False).content.decode('utf-8')

# Read the response cookies as a dict
resp = requests.post(url, headers=headers, data=data, verify=False)
requests.utils.dict_from_cookiejar(resp.cookies)
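Note that the Session created above is never actually used. A Session's point is to persist cookies and reuse connections across requests; a minimal sketch, reusing the url/headers/data placeholders above:

s = requests.Session()
s.headers.update(headers)        # these headers are sent on every request from this session
login = s.post(url, data=data)   # cookies set by this response...
page = s.get(url)                # ...are sent back automatically on later requests
print(s.cookies.get_dict())      # same information as requests.utils.dict_from_cookiejar(s.cookies)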
2. Basic GET request and response attributes
import requests
kw = {'wd':'长城'}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
# params accepts a dict or string of query parameters; a dict is URL-encoded automatically, no urlencode() needed
response = requests.get("http://www.baidu.com/s", params=kw, headers=headers)
# response.text returns the body decoded to str (Unicode)
print(response.text)
# response.content returns the raw byte stream
print(response.content)
# The full URL that was requested
print(response.url)
# The character encoding guessed from the response headers (used to decode .text)
print(response.encoding)
# The HTTP status code
print(response.status_code)
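If response.text comes out garbled, the encoding guessed from the headers is usually wrong; assigning response.encoding before reading .text overrides the guess. A minimal sketch against the same Baidu request:

response = requests.get("http://www.baidu.com/s", params=kw, headers=headers)
response.encoding = 'utf-8'   # override the header-derived guess before decoding
print(response.text[:200])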
3. POSTing a JSON request payload
import requests
import json
payloadHeader = {
    'Host': 'sellercentral.amazon.com',
    'Content-Type': 'application/json',
}
# postUrl and payloadData are placeholders for the target URL and the dict to send
requests.post(postUrl, data=json.dumps(payloadData), headers=payloadHeader)
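requests can also serialize the payload itself: passing the dict as json= runs json.dumps for you and sets the Content-Type: application/json header automatically. A minimal sketch with the same placeholders:

# Equivalent to data=json.dumps(payloadData) plus a JSON Content-Type header
requests.post(postUrl, json=payloadData)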
4. Storing a dict in the database (the table columns must match the dict keys)
import pymysql

class MogujiePipeline(object):
    def __init__(self):
        # Open the database connection
        self.db = pymysql.connect(host='localhost', port=3306, database='cfda', user='root', password='root',
                                  charset='utf8')
        # self.db = pymysql.connect(host='rm-bp195i4u0w1066u709o.mysql.rds.aliyuncs.com', port=3306, database='spider58',
        #                           user='spider58',
        #                           password='58spider@123',
        #                           charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # Skip items whose clientUrl is already in the database
        num = self.cursor.execute('select id from mogujie where clientUrl="{}"'.format(item["clientUrl"]))
        if not num:
            list_keys = []
            list_values = []
            for key, value in item.items():
                list_keys.append(key)
                # Swap ASCII single quotes for full-width ones so values cannot break the SQL string
                list_values.append("'" + str(value).replace("'", "‘") + "'")
            # Build the INSERT statement from the item's keys and values
            insert_sql = 'insert into mogujie({}) values({})'.format(', '.join(list_keys),
                                                                     ', '.join(list_values))
            print('insert_sql:', insert_sql)
            self.cursor.execute(insert_sql)
            self.db.commit()
        return item

    def close_spider(self, spider):
        # Close the database connection
        self.cursor.close()
        self.db.close()
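The quote-swapping trick above mangles stored values and still leaves the query fragile. A safer variant of the same pipeline step, assuming the same mogujie table, lets pymysql escape the values through %s placeholders (column names still have to be interpolated, since identifiers cannot be parameterized); a minimal sketch:

    def process_item(self, item, spider):
        num = self.cursor.execute('select id from mogujie where clientUrl=%s', (item['clientUrl'],))
        if not num:
            keys = list(item.keys())
            insert_sql = 'insert into mogujie({}) values({})'.format(
                ', '.join(keys), ', '.join(['%s'] * len(keys)))
            # pymysql quotes and escapes each value, so no manual replace() is needed
            self.cursor.execute(insert_sql, [str(item[k]) for k in keys])
            self.db.commit()
        return item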
5. Scraping JSON data
import requests
import json
import pymysql
import logging
logging.basicConfig(
    level=logging.INFO,  # only messages at this level or above are written to the file
    format='%(asctime)s %(filename)s %(levelname)s : %(message)s',  # log line format
    datefmt='%Y-%m-%d %H:%M:%S',  # timestamp format
    filename='yibao.log',  # log file name
    filemode='a')  # write mode: 'w' to overwrite, 'a' to append
class Yibao(object):
    def __init__(self):
        self.db = pymysql.connect(host='localhost', port=3306, database='cfda', user='root', password='root',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
        self.url = 'http://code.nhsa.gov.cn:8000/jbzd/public/toStdOperationTreeList.html'
        self.parse_page()

    def parse_page(self):
        data = {
            'operationId': 'icdIds',
        }
        html = requests.post(url=self.url, headers=self.headers, data=data).content.decode('utf-8')
        data_json = json.loads(html)
        for item in data_json:
            # Insert only records that are not already present
            # ('id' is a hypothetical field name: the original left the format() arguments blank)
            num = self.cursor.execute('select id from catalogue where id={}'.format(item['id']))
            if not num:
                self.cursor.execute(
                    'insert into catalogue(id) values({})'.format(item['id']))
                self.db.commit()
        # Generic CRUD examples; the column names and values were left blank in
        # the original, so 'name' and some_id below are hypothetical
        self.cursor.execute("select * from catalogue")
        row = self.cursor.fetchone()   # fetch a single row
        rows = self.cursor.fetchall()  # fetch all remaining rows
        some_id = 1
        self.cursor.execute("update catalogue set name='{}' where id={}".format('new name', some_id))
        self.db.commit()
        self.cursor.execute("delete from catalogue where id={}".format(some_id))
        self.db.commit()

if __name__ == '__main__':
    Yibao()
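Since the endpoint returns JSON, response.json() can replace the manual decode-plus-json.loads pair; a minimal sketch of the same fetch:

import requests

url = 'http://code.nhsa.gov.cn:8000/jbzd/public/toStdOperationTreeList.html'
# .json() decodes and parses the body in one step, replacing decode() + json.loads()
data_json = requests.post(url, data={'operationId': 'icdIds'}).json()
print(type(data_json))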
6. Scraping HTML data
import requests
import json
import time
import pymysql
import logging
import random
from lxml import etree
logging.basicConfig(
    level=logging.INFO,  # only messages at this level or above are written to the file
    format='%(asctime)s %(filename)s %(levelname)s : %(message)s',  # log line format
    datefmt='%Y-%m-%d %H:%M:%S',  # timestamp format
    filename='yibao.log',  # log file name
    filemode='a')  # write mode: 'w' to overwrite, 'a' to append
class Yibao(object):
    def __init__(self):
        self.db = pymysql.connect(host='localhost', port=3306, database='cfda', user='root', password='root',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
        self.url = 'http://code.nhsa.gov.cn:8000/jbzd/public/toStdOperationTreeList.html'
        self.parse_page()

    def parse_page(self):
        data = {
            'operationId': 'icdIds',
        }
        html = requests.post(url=self.url, headers=self.headers, data=data).content.decode('utf-8')
        etree_html = etree.HTML(html)
        # Two parallel lists: the <a> texts and the matching <span> texts
        names = etree_html.xpath(
            '//*[@id="classicont"]/div[@class="els-doc-h4"]/a//text() | //div[@class="els-doc-con-left"]/a//text()')
        codes = etree_html.xpath(
            '//*[@id="classicont"]/div[@class="els-doc-h4"]/span//text() | //div[@class="els-doc-con-left"]/span//text()')
        for i in range(len(names)):
            # Skip records already present ('name'/'code' are hypothetical columns:
            # the original left the format() arguments blank)
            num = self.cursor.execute("select id from catalogue where name='{}'".format(names[i]))
            if not num:
                self.cursor.execute(
                    "insert into catalogue(name, code) values('{}', '{}')".format(names[i], codes[i]))
                self.db.commit()
        # Generic CRUD examples; 'name' and some_id are hypothetical, as above
        self.cursor.execute("select * from catalogue")
        row = self.cursor.fetchone()   # fetch a single row
        rows = self.cursor.fetchall()  # fetch all remaining rows
        some_id = 1
        self.cursor.execute("update catalogue set name='{}' where id={}".format('new name', some_id))
        self.db.commit()
        self.cursor.execute("delete from catalogue where id={}".format(some_id))
        self.db.commit()

if __name__ == '__main__':
    Yibao()
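Because the two XPath queries return parallel lists, pairing them with zip() reads more clearly than indexing, and it stops at the shorter list if the lengths ever differ; a minimal sketch using the names/codes lists from above:

for name, code in zip(names, codes):
    print(name.strip(), code.strip())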
7. Using proxies
# Anonymous proxy
proxies = {
    "http": "http://ip:port",
    "https": "https://ip:port",
}
requests.get(url, proxies=proxies)

# Proxy that requires authentication
proxies = {
    "http": "http://username:password@ip:port",
    "https": "https://username:password@ip:port",
}
requests.get(url, proxies=proxies)
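To confirm traffic actually goes through the proxy, request an IP-echo service and check the reported origin; a minimal sketch (httpbin.org is one such service, and the proxy address is the placeholder from section 1):

import requests

proxies = {
    "http": "http://47.98.163.18:8080",
    "https": "http://47.98.163.18:8080",
}
# The echoed origin IP should be the proxy's address, not yours
print(requests.get("http://httpbin.org/ip", proxies=proxies, timeout=10).text)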
# .replace("'", "’") is the quote-swapping trick from section 4, for values embedded in hand-built SQL