1. Sending requests with requests
import requests

# url and data below are placeholders for the target URL and POST form data
s = requests.Session()
payload = {'key1': 'value1', 'key2': 'value2'}
proxies = {'http': 'http://47.98.163.18:8080', 'https': 'http://47.98.163.18:8080'}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

# headers          request headers
# params           query-string parameters
# data             POST form data
# verify           SSL certificate verification
# allow_redirects  whether to follow redirects
# proxies          proxy settings
requests.get(url, headers=headers, verify=False, params=payload, allow_redirects=False, proxies=proxies).content.decode('utf-8')
requests.post(url, headers=headers, data=data, verify=False, allow_redirects=False).content.decode('utf-8')

# Read the response cookies as a dict
resp = requests.post(url, headers=headers, data=data, verify=False)
requests.utils.dict_from_cookiejar(resp.cookies)
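Note that the Session created above is never actually used. A Session's point is to persist cookies and reuse connections across requests; a minimal sketch, reusing the url/headers/data placeholders above:

s = requests.Session()
s.headers.update(headers)        # these headers are sent on every request from this session
login = s.post(url, data=data)   # cookies set by this response...
page = s.get(url)                # ...are sent back automatically on later requests
print(s.cookies.get_dict())      # same information as requests.utils.dict_from_cookiejar(s.cookies)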
2. Basic GET request and response attributes
import requests
kw = {'wd':'长城'}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
# params accepts a dict or string of query parameters; a dict is URL-encoded automatically, no urlencode() needed
response = requests.get("http://www.baidu.com/s", params=kw, headers=headers)
# response.text returns the body decoded to str (Unicode)
print(response.text)
# response.content returns the raw byte stream
print(response.content)
# The full URL that was requested
print(response.url)
# The character encoding guessed from the response headers (used to decode .text)
print(response.encoding)
# The HTTP status code
print(response.status_code)
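If response.text comes out garbled, the encoding guessed from the headers is usually wrong; assigning response.encoding before reading .text overrides the guess. A minimal sketch against the same Baidu request:

response = requests.get("http://www.baidu.com/s", params=kw, headers=headers)
response.encoding = 'utf-8'   # override the header-derived guess before decoding
print(response.text[:200])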
3. POSTing a JSON request payload
import requests
import json
payloadHeader = {
    'Host': 'sellercentral.amazon.com',
    'Content-Type': 'application/json',
}
# postUrl and payloadData are placeholders for the target URL and the dict to send
requests.post(postUrl, data=json.dumps(payloadData), headers=payloadHeader)
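requests can also serialize the payload itself: passing the dict as json= runs json.dumps for you and sets the Content-Type: application/json header automatically. A minimal sketch with the same placeholders:

# Equivalent to data=json.dumps(payloadData) plus a JSON Content-Type header
requests.post(postUrl, json=payloadData)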
4. Storing a dict in the database (the table columns must match the dict keys)
import pymysql

class MogujiePipeline(object):
    def __init__(self):
        # Open the database connection
        self.db = pymysql.connect(host='localhost', port=3306, database='cfda', user='root', password='root',
                                  charset='utf8')
        # self.db = pymysql.connect(host='rm-bp195i4u0w1066u709o.mysql.rds.aliyuncs.com', port=3306, database='spider58',
        #                           user='spider58',
        #                           password='58spider@123',
        #                           charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # Skip items whose clientUrl is already in the database
        num = self.cursor.execute('select id from mogujie where clientUrl="{}"'.format(item["clientUrl"]))
        if not num:
            list_keys = []
            list_values = []
            for key, value in item.items():
                list_keys.append(key)
                # Swap ASCII single quotes for full-width ones so values cannot break the SQL string
                list_values.append("'" + str(value).replace("'", "‘") + "'")
            # Build the INSERT statement from the item's keys and values
            insert_sql = 'insert into mogujie({}) values({})'.format(', '.join(list_keys),
                                                                     ', '.join(list_values))
            print('insert_sql:', insert_sql)
            self.cursor.execute(insert_sql)
            self.db.commit()
        return item

    def close_spider(self, spider):
        # Close the database connection
        self.cursor.close()
        self.db.close()
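The quote-swapping trick above mangles stored values and still leaves the query fragile. A safer variant of the same pipeline step, assuming the same mogujie table, lets pymysql escape the values through %s placeholders (column names still have to be interpolated, since identifiers cannot be parameterized); a minimal sketch:

    def process_item(self, item, spider):
        num = self.cursor.execute('select id from mogujie where clientUrl=%s', (item['clientUrl'],))
        if not num:
            keys = list(item.keys())
            insert_sql = 'insert into mogujie({}) values({})'.format(
                ', '.join(keys), ', '.join(['%s'] * len(keys)))
            # pymysql quotes and escapes each value, so no manual replace() is needed
            self.cursor.execute(insert_sql, [str(item[k]) for k in keys])
            self.db.commit()
        return item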
5. Scraping JSON data
import requests
import json
import pymysql
import logging
logging.basicConfig(
    level=logging.INFO,  # only messages at this level or above are written to the file
    format='%(asctime)s %(filename)s %(levelname)s : %(message)s',  # log line format
    datefmt='%Y-%m-%d %H:%M:%S',  # timestamp format
    filename='yibao.log',  # log file name
    filemode='a')  # write mode: 'w' to overwrite, 'a' to append
class Yibao(object):
    def __init__(self):
        self.db = pymysql.connect(host='localhost', port=3306, database='cfda', user='root', password='root',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
        self.url = 'http://code.nhsa.gov.cn:8000/jbzd/public/toStdOperationTreeList.html'
        self.parse_page()

    def parse_page(self):
        data = {
            'operationId': 'icdIds',
        }
        html = requests.post(url=self.url, headers=self.headers, data=data).content.decode('utf-8')
        data_json = json.loads(html)
        for item in data_json:
            # Insert only records that are not already present
            # ('id' is a hypothetical field name: the original left the format() arguments blank)
            num = self.cursor.execute('select id from catalogue where id={}'.format(item['id']))
            if not num:
                self.cursor.execute(
                    'insert into catalogue(id) values({})'.format(item['id']))
                self.db.commit()
        # Generic CRUD examples; the column names and values were left blank in
        # the original, so 'name' and some_id below are hypothetical
        self.cursor.execute("select * from catalogue")
        row = self.cursor.fetchone()   # fetch a single row
        rows = self.cursor.fetchall()  # fetch all remaining rows
        some_id = 1
        self.cursor.execute("update catalogue set name='{}' where id={}".format('new name', some_id))
        self.db.commit()
        self.cursor.execute("delete from catalogue where id={}".format(some_id))
        self.db.commit()

if __name__ == '__main__':
    Yibao()
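Since the endpoint returns JSON, response.json() can replace the manual decode-plus-json.loads pair; a minimal sketch of the same fetch:

import requests

url = 'http://code.nhsa.gov.cn:8000/jbzd/public/toStdOperationTreeList.html'
# .json() decodes and parses the body in one step, replacing decode() + json.loads()
data_json = requests.post(url, data={'operationId': 'icdIds'}).json()
print(type(data_json))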
6. Scraping HTML data
import requests
import json
import time
import pymysql
import logging
import random
from lxml import etree
logging.basicConfig(
    level=logging.INFO,  # only messages at this level or above are written to the file
    format='%(asctime)s %(filename)s %(levelname)s : %(message)s',  # log line format
    datefmt='%Y-%m-%d %H:%M:%S',  # timestamp format
    filename='yibao.log',  # log file name
    filemode='a')  # write mode: 'w' to overwrite, 'a' to append
class Yibao(object):
    def __init__(self):
        self.db = pymysql.connect(host='localhost', port=3306, database='cfda', user='root', password='root',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
        self.url = 'http://code.nhsa.gov.cn:8000/jbzd/public/toStdOperationTreeList.html'
        self.parse_page()

    def parse_page(self):
        data = {
            'operationId': 'icdIds',
        }
        html = requests.post(url=self.url, headers=self.headers, data=data).content.decode('utf-8')
        etree_html = etree.HTML(html)
        # Two parallel lists: the <a> texts and the matching <span> texts
        names = etree_html.xpath(
            '//*[@id="classicont"]/div[@class="els-doc-h4"]/a//text() | //div[@class="els-doc-con-left"]/a//text()')
        codes = etree_html.xpath(
            '//*[@id="classicont"]/div[@class="els-doc-h4"]/span//text() | //div[@class="els-doc-con-left"]/span//text()')
        for i in range(len(names)):
            # Skip records already present ('name'/'code' are hypothetical columns:
            # the original left the format() arguments blank)
            num = self.cursor.execute("select id from catalogue where name='{}'".format(names[i]))
            if not num:
                self.cursor.execute(
                    "insert into catalogue(name, code) values('{}', '{}')".format(names[i], codes[i]))
                self.db.commit()
        # Generic CRUD examples; 'name' and some_id are hypothetical, as above
        self.cursor.execute("select * from catalogue")
        row = self.cursor.fetchone()   # fetch a single row
        rows = self.cursor.fetchall()  # fetch all remaining rows
        some_id = 1
        self.cursor.execute("update catalogue set name='{}' where id={}".format('new name', some_id))
        self.db.commit()
        self.cursor.execute("delete from catalogue where id={}".format(some_id))
        self.db.commit()

if __name__ == '__main__':
    Yibao()
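Because the two XPath queries return parallel lists, pairing them with zip() reads more clearly than indexing, and it stops at the shorter list if the lengths ever differ; a minimal sketch using the names/codes lists from above:

for name, code in zip(names, codes):
    print(name.strip(), code.strip())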
7. Using proxies
# Anonymous proxy
proxies = {
    "http": "http://ip:port",
    "https": "https://ip:port",
}
requests.get(url, proxies=proxies)

# Proxy that requires authentication
proxies = {
    "http": "http://username:password@ip:port",
    "https": "https://username:password@ip:port",
}
requests.get(url, proxies=proxies)
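To confirm traffic actually goes through the proxy, request an IP-echo service and check the reported origin; a minimal sketch (httpbin.org is one such service, and the proxy address is the placeholder from section 1):

import requests

proxies = {
    "http": "http://47.98.163.18:8080",
    "https": "http://47.98.163.18:8080",
}
# The echoed origin IP should be the proxy's address, not yours
print(requests.get("http://httpbin.org/ip", proxies=proxies, timeout=10).text)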
# .replace("'", "’") is the quote-swapping trick from section 4, for values embedded in hand-built SQL