Windows 10/11上自动安装配置Hadoop 3.1.3的Python代码

weixin_30777913
已于 2024-01-28 12:31:42 修改
阅读量388
点赞数 7
文章标签： windows hadoop python
于 2024-01-27 22:39:46 首次发布
本文链接：https://blog.csdn.net/weixin_30777913/article/details/135889740
版权
import os
import subprocess
import sys
import xml.etree.ElementTree as ET
from lxml import etree
from shutil import copyfile, move

# Hadoop release to install; used to build the folder name and download URL.
version = '3.1.3'
# Name of the release folder / archive stem, e.g. "hadoop-3.1.3".
hadoop = f'hadoop-{version}'
# Hive release number; not referenced anywhere in the visible part of this file.
hive_version = '3.1.2'
# Drive root the Hadoop archive is extracted into (passed to ``tar -C``).
path = 'C:\\'
# Full Hadoop installation directory; written to HADOOP_HOME.
directory_path = path + hadoop
# Expected JRE 8 location; written to JAVA_HOME by jre_8_install().
java_path = 'C:\\Program Files\\Java\\jre-1.8'


def jre_8_install():
    """Make sure a Java runtime is installed and JAVA_HOME equals ``java_path``.

    ``java -version`` is probed first (it reports on stderr). When the
    executable cannot be launched, or its banner does not start with
    ``java version``, the Oracle JRE is installed through ``winget``.
    Afterwards JAVA_HOME is persisted with ``setx`` unless the current
    process already sees the expected value.

    Raises:
        subprocess.CalledProcessError: if ``winget`` or ``setx`` exits with a
            non-zero status.
    """
    try:
        result = subprocess.run(['java', '-version'], stderr=subprocess.PIPE, universal_newlines=True)
    except OSError:
        # ``java`` is not on PATH (or cannot be executed): install it.
        needs_install = True
    else:
        needs_install = not result.stderr.startswith('java version')
        if needs_install:
            # Show what the probe reported before installing over it.
            print(result.stderr)

    if needs_install:
        subprocess.run(['winget', 'install', 'Oracle.JavaRuntimeEnvironment'], shell=True, check=True)
    print('JRE 8 install successfully')

    if os.environ.get('JAVA_HOME') != java_path:
        subprocess.run(['setx', 'JAVA_HOME', java_path], shell=True, check=True)
    print('JAVA_HOME environment variable is set')

        
def set_hadoop_home():
    """Persist HADOOP_HOME and make sure Hadoop's bin/sbin folders are on Path.

    Nothing is changed when the current process already sees HADOOP_HOME
    equal to ``directory_path``. Otherwise HADOOP_HOME is written with
    ``setx`` and ``hadoop --version`` is probed; only when that launcher
    cannot be started is the Path variable extended.

    Raises:
        subprocess.CalledProcessError: if a ``setx`` call fails.
    """
    if os.environ.get('HADOOP_HOME') == directory_path:
        print('HADOOP_HOME environment variable is set')
        return

    subprocess.run(['setx', 'HADOOP_HOME', directory_path], shell=True, check=True)
    print('HADOOP_HOME environment variable is set')

    try:
        result = subprocess.run(['hadoop', '--version'], stderr=subprocess.PIPE, universal_newlines=True)
    except OSError:
        # The launcher is not reachable through Path yet; fall through and add it.
        pass
    else:
        if result.stderr.startswith('java'):
            print('Path environment variable is set')
        else:
            print(result.stderr)
        return

    current_path = os.environ.get('Path') or ''
    missing = [
        sub for sub in ('bin', 'sbin')
        if directory_path + '\\' + sub not in current_path
    ]
    if missing:
        # Build the new value once so that the bin entry is not overwritten
        # by a second setx call that adds only sbin.
        new_path = current_path.replace(";;", ";").rstrip(';')
        for sub in missing:
            new_path += ';%HADOOP_HOME%' + '\\' + sub
        # NOTE(review): setx truncates values longer than 1024 characters and
        # this writes the process's merged Path into the user Path; confirm
        # that is acceptable on the target machines.
        print(f'Path=\n{new_path}')
        subprocess.run(['setx', 'Path', new_path], shell=True, check=True)
    print('Path environment variable is set')

        
def set_java_library_path():
    """Persist JAVA_LIBRARY_PATH as Hadoop's ``lib\\native`` folder via ``setx``.

    The ``setx`` call is skipped when the current process already sees that
    value; the confirmation line is printed either way.
    """
    native_dir = rf'{directory_path}\lib\native'
    already_set = os.environ.get('JAVA_LIBRARY_PATH') == native_dir
    if not already_set:
        subprocess.run(['setx', 'JAVA_LIBRARY_PATH', native_dir], shell=True, check=True)
    print('JAVA_LIBRARY_PATH environment variable is set')

        
def set_hadoop_common_lib_native_dir():
    """Persist HADOOP_COMMON_LIB_NATIVE_DIR via ``setx`` when it is not yet set.

    The current value is compared against the expanded ``lib\\native`` path
    under ``directory_path``; the value handed to ``setx`` is written in
    terms of ``%HADOOP_HOME%``.
    """
    expected = rf'{directory_path}\lib\native'
    current = os.environ.get('HADOOP_COMMON_LIB_NATIVE_DIR')
    if current != expected:
        setx_cmd = ['setx', 'HADOOP_COMMON_LIB_NATIVE_DIR', r'%HADOOP_HOME%\lib\native']
        subprocess.run(setx_cmd, shell=True, check=True)
    print('HADOOP_COMMON_LIB_NATIVE_DIR environment variable is set')


def create_directories(base_dir):
    """Create the HDFS working folders below *base_dir* if they are missing.

    The folders are ``data``, ``data/namenode``, ``data/datanode`` and
    ``tmp``. Paths that already exist are left untouched.
    """
    data_root = os.path.join(base_dir, "data")
    wanted = (
        data_root,
        os.path.join(data_root, "namenode"),
        os.path.join(data_root, "datanode"),
        os.path.join(base_dir, "tmp"),
    )
    for folder in wanted:
        if os.path.exists(folder):
            continue
        os.makedirs(folder)
    print("Directories namenode and datanode are created")


# Apache license text re-inserted above <configuration>; the ElementTree
# parse used below drops the comments that were in the original file.
_LICENSE_COMMENT = (
    '\n  Licensed under the Apache License, Version 2.0 (the "License");\n'
    '  you may not use this file except in compliance with the License.\n'
    '  You may obtain a copy of the License at\n'
    '\n'
    '    http://www.apache.org/licenses/LICENSE-2.0\n'
    '\n'
    '  Unless required by applicable law or agreed to in writing, software\n'
    '  distributed under the License is distributed on an "AS IS" BASIS,\n'
    '  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n'
    '  See the License for the specific language governing permissions and\n'
    '  limitations under the License. See accompanying LICENSE file.\n'
)


def _add_header_and_pretty_print(file_path):
    """Re-write *file_path* pretty-printed with the stylesheet PI and license.

    Raises:
        ValueError: if the document root is not ``configuration``.
    """
    parser = etree.XMLParser(remove_blank_text=True)
    tree = etree.parse(file_path, parser)
    root = tree.getroot()
    if root.tag != 'configuration':
        raise ValueError("Expected root element to be 'configuration'")

    # Both nodes become siblings placed before the root element.
    root.addprevious(etree.ProcessingInstruction(
        'xml-stylesheet', 'type="text/xsl" href="configuration.xsl"'))
    root.addprevious(etree.Comment(_LICENSE_COMMENT))

    with open(file_path, 'wb') as f:
        etree.ElementTree(root).write(f, pretty_print=True, xml_declaration=True, encoding='utf-8')


def _ensure_site_properties(base_dir, file_name, properties):
    """Write *properties* into ``<base_dir>\\etc\\hadoop\\<file_name>``.

    Args:
        base_dir: Hadoop installation directory.
        file_name: name of the ``*-site.xml`` file to update.
        properties: ordered ``(name, value)`` pairs, one per ``<property>``.

    The file is modified only when its first ``<property>`` element is
    absent or has no children; a file that already carries a populated
    property is left byte-for-byte unchanged. A missing file is reported
    and skipped.
    """
    file_path = rf'{base_dir}\etc\hadoop' + '\\' + file_name
    if not os.path.exists(file_path):
        print(f"{file_path} does not exist")
        return

    tree = ET.parse(file_path)
    root = tree.getroot()
    if root.tag != 'configuration':
        # Wrap a foreign root so the result is a <configuration> document.
        wrapper = ET.Element('configuration')
        wrapper.append(root)
        tree = ET.ElementTree(wrapper)
        root = wrapper

    # Compare against None explicitly: a childless Element is falsy, so a
    # truth test would treat an existing empty <property/> as missing.
    first_property = root.find('property')
    if first_property is None:
        first_property = ET.SubElement(root, 'property')

    if len(first_property) == 0:
        nodes = [first_property]
        nodes.extend(ET.SubElement(root, 'property') for _ in properties[1:])
        for node, (name, value) in zip(nodes, properties):
            ET.SubElement(node, 'name').text = name
            ET.SubElement(node, 'value').text = value

        tree.write(file_path, encoding='utf-8', xml_declaration=True)
        # Second pass with lxml restores the header nodes and indentation.
        _add_header_and_pretty_print(file_path)
    print(f"{file_name} updated")


def update_core_site_xml(base_dir):
    """Set ``fs.defaultFS`` to ``hdfs://localhost:9000`` in core-site.xml."""
    _ensure_site_properties(base_dir, 'core-site.xml', [
        ('fs.defaultFS', 'hdfs://localhost:9000'),
    ])


def update_hdfs_site_xml(base_dir):
    """Set the replication factor and name/data node folders in hdfs-site.xml.

    The folder values are built from the module-level ``path`` and
    ``hadoop`` names, not from *base_dir*.
    """
    _ensure_site_properties(base_dir, 'hdfs-site.xml', [
        ('dfs.replication', '1'),
        ('dfs.namenode.name.dir', f'/{path[:-1]}/{hadoop}/data/namenode'),
        ('dfs.datanode.data.dir', f'/{path[:-1]}/{hadoop}/data/datanode'),
    ])


def update_mapred_site_xml(base_dir):
    """Set ``mapreduce.framework.name`` to ``yarn`` in mapred-site.xml."""
    _ensure_site_properties(base_dir, 'mapred-site.xml', [
        ('mapreduce.framework.name', 'yarn'),
    ])


def update_yarn_site_xml(base_dir):
    """Write the shuffle aux-service and node-manager limits to yarn-site.xml."""
    _ensure_site_properties(base_dir, 'yarn-site.xml', [
        ('yarn.nodemanager.aux-services', 'mapreduce_shuffle'),
        ('yarn.nodemanager.aux-services.mapreduce.shuffle.class', 'org.apache.hadoop.mapred.ShuffleHandler'),
        ('yarn.nodemanager.resource.memory-mb', '1024'),
        ('yarn.nodemanager.resource.cpu-vcores', '1'),
    ])


def hadoop_download_decompress(base_dir):
    """Download the Hadoop release archive and extract it unless present.

    Args:
        base_dir: expected installation directory; when it already exists as
            a directory nothing is downloaded.

    The archive is fetched with ``curl`` into a ``.tmp`` file in the current
    working directory, renamed once complete, and extracted with ``tar``
    into the module-level ``path``.

    Raises:
        subprocess.CalledProcessError: if ``curl`` or ``tar`` fails.
    """
    if os.path.isdir(base_dir):
        print(f"Hadoop {version} downloaded & decompressed")
        return

    archive = f'{hadoop}.tar.gz'
    partial = f'{archive}.tmp'
    url = f'https://archive.apache.org/dist/hadoop/common/{hadoop}/{hadoop}.tar.gz'

    print(f"Hadoop {version} started downloading")
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error page as the archive; -L follows redirects.
    subprocess.run(['curl', '--fail', '-L', url, '-o', partial], shell=True, check=True)
    # os.replace overwrites a stale archive left by an earlier run, where
    # os.rename raises on Windows.
    os.replace(partial, archive)
    print(f"Hadoop {version} downloaded")

    subprocess.run(['tar', '-xzvf', archive, '-C', path], shell=True, check=True)
    if os.path.isdir(base_dir):
        print(f"Hadoop {version} decompressed")


def update_hadoop_env_cmd(base_dir):
    """Point ``hadoop-env.cmd`` at a JRE copy under ``C:\\jre-1.8\\``.

    Args:
        base_dir: Hadoop installation directory.

    The JRE at ``java_path`` is copied to ``C:\\jre-1.8\\`` with ``xcopy``
    when that folder does not exist yet (presumably to get a path without
    spaces; confirm with the author). Independently of that copy, the line
    ``set JAVA_HOME=%JAVA_HOME%`` in the file is replaced, so a freshly
    extracted Hadoop is still patched when the JRE copy already exists.
    A missing ``hadoop-env.cmd`` is reported and skipped.
    """
    file_path = rf'{base_dir}\etc\hadoop\hadoop-env.cmd'
    if not os.path.exists(file_path):
        print(f"{file_path} does not exist")
        return

    java_dir = 'C:\\jre-1.8\\'
    if not os.path.exists(java_dir):
        os.system(f'xcopy /s /i "{java_path}" {java_dir}')

    with open(file_path, 'r') as file:
        file_content = file.read()

    new_content = file_content.replace('set JAVA_HOME=%JAVA_HOME%', f'set JAVA_HOME={java_dir}')
    if new_content != file_content:
        # Write to a sibling temp file and swap it in with one call, so the
        # original is never deleted before the replacement is in place.
        temp_file_path = file_path + '.tmp'
        with open(temp_file_path, 'w') as file:
            file.write(new_content)
        os.replace(temp_file_path, file_path)
    print("hadoop-env.cmd updated")


def copy_jar_file(base_dir, jar_version='3.1.3'):
    """Copy the YARN timeline-service jar one level up into ``share\\hadoop\\yarn``.

    Args:
        base_dir: Hadoop installation directory.
        jar_version: version suffix of the jar file name; defaults to the
            release this script installs.

    An existing target is never overwritten. A missing source is reported,
    and a failed copy is reported without the "copy done" message.
    """
    jar_name = f'hadoop-yarn-server-timelineservice-{jar_version}.jar'
    source = rf'{base_dir}\share\hadoop\yarn\timelineservice' + '\\' + jar_name
    target = rf'{base_dir}\share\hadoop\yarn' + '\\' + jar_name

    if not os.path.isfile(source):
        print(f"{source} does not exist")
        return

    if not os.path.exists(target):
        try:
            copyfile(source, target)
        except OSError as e:
            # Stop here so a failed copy is not reported as done.
            print("Unable to copy file. %s" % e)
            return
    print(f"'{jar_name}' copy done")


# --- Script body: runs every installation step in order on execution/import ---
# 1. Java runtime and JAVA_HOME.
jre_8_install()
# 2. Fetch and unpack the Hadoop release if its folder is not there yet.
hadoop_download_decompress(directory_path)
# 3. Persist the environment variables with setx.
set_hadoop_home()
set_java_library_path()
set_hadoop_common_lib_native_dir()
# 4. Create the data/namenode, data/datanode and tmp folders.
create_directories(directory_path)
# 5. Fill in the four *-site.xml configuration files.
update_core_site_xml(directory_path)
update_hdfs_site_xml(directory_path)
update_mapred_site_xml(directory_path)
update_yarn_site_xml(directory_path)
# 6. Point hadoop-env.cmd at the copied JRE.
update_hadoop_env_cmd(directory_path)
# The remaining print() calls only display commands, download links and URLs;
# nothing is executed or downloaded by them.
print('hdfs namenode -format')
print(f'Download apache-{hadoop}-winutils.zip from https://github.com/s911415/apache-hadoop-3.1.3-winutils or https://gitcode.com/weixin_307779131/apache-hadoop-3.1.3-winutils/tree/master or https://download.csdn.net/download/weixin_43576022/12381058')
print(rf'decompress and copy the files to {directory_path}\bin and copy "hadoop.dll" and "hdfs.dll" to {directory_path}\lib\native')
# 7. Copy the timeline-service jar into share\hadoop\yarn.
copy_jar_file(directory_path)
print('start-all.cmd')
print('http://localhost:8088/cluster')
print('http://localhost:9870')