hadoop ExtendedFileUtil -

zhrglchp

浏览: 115340 次
性别:
来自: 北京

最近访客更多访客>>

linxl2011

gggfff39

7jkl

happyzhaow

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

hadoop ExtendedFileUtil

博客分类：

hadoop

在用Hadoop编写生产环境的作业时，我发现以下几项功能是许多MapReduce作业都需要的，但Hadoop 0.20 API并没有直接提供。
1) 获取HDFS文件或目录的大小
 通过查看执行任务的输入数据的数量，动态改变使用到任务中的reducer的数量。
2) 从HDFS目录中递归移除所有零字节文件
 在reducer中使用MultipleOutputs类时（在mapper中使用时这种情况较少），会产生很多这类文件。很多时候reducer不会向某个MultipleOutputs文件写入任何记录，因此最好在任务完成后删除这些零字节文件。
3) 递归获取某个目录的所有子目录
4) 递归获取某个目录的所有文件和目录的子目录
 目前默认情况下，运行hadoop任务时，它只处理直接位于输入目录下的文件，输入路径的子目录中的任何文件都不会被处理。因此如果想要处理子目录下的所有文件，最好先创建一个用逗号分隔的列表，列出输入路径下的所有文件，再提交给任务执行。

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;
import java.util.regex.Pattern;

/**
 * Helpers for common HDFS chores that the stock {@link FileUtil} does not cover:
 * listing files/directories (optionally recursively), summing sizes, deleting
 * zero-byte files and moving paths to trash.
 *
 * <p>All "list" style parameters are comma delimited. Entries are trimmed and empty
 * entries (for example the one produced by a trailing comma) are ignored.
 *
 * <p>{@link FileSystem} handles obtained through {@code FileSystem.get(...)} are
 * intentionally never closed here: Hadoop hands out shared, cached instances, so
 * closing one would also close it for every other user in the same JVM.
 */
public class ExtendedFileUtil extends FileUtil {

    /**
     * Splits a comma delimited path list into its trimmed, non-empty entries.
     *
     * @param pathList comma delimited list of paths; must not be {@code null}
     * @return the non-empty entries in their original order
     */
    private static String[] splitPathList(String pathList) {
        ArrayList<String> entries = new ArrayList<String>();
        for (String raw : pathList.split(",", -1)) {
            String entry = raw.trim();
            if (entry.length() > 0) {
                entries.add(entry);
            }
        }
        return entries.toArray(new String[entries.size()]);
    }

    /**
     * Lists the children of every entry in {@code fileOrDirList}.
     *
     * <p>The direct children of each entry are always listed; sub directories are only
     * descended into when {@code recursively} is set. Directories waiting to be listed
     * are kept on a stack, so deeper levels are visited in last-in-first-out order.
     *
     * @param fileOrDirList  comma delimited list of files or directories; entries without a
     *                       scheme are resolved against the configured default file system
     * @param recursively    whether to descend into sub directories
     * @param getDirectories whether directories are included in the result
     * @param getFiles       whether files are included in the result
     * @return the string form of every matching path
     * @throws IOException if a file system cannot be reached or a listing fails
     */
    private String[] getFilesAndDirectories(String fileOrDirList, boolean recursively
            , boolean getDirectories, boolean getFiles) throws IOException {
        Configuration configuration = new Configuration();
        ArrayList<String> found = new ArrayList<String>();
        Stack<Path> pending = new Stack<Path>();

        for (String entry : splitPathList(fileOrDirList)) {
            Path start = new Path(entry);
            // Qualify the starting point so scheme-less input such as "/user/dir"
            // is resolved against the file system it actually lives on.
            pending.push(FileSystem.get(start.toUri(), configuration).makeQualified(start));

            while (!pending.empty()) {
                Path current = pending.pop();
                // Cached lookup; resolved per path so every listing uses the owning file system.
                FileSystem fs = FileSystem.get(current.toUri(), configuration);
                FileStatus[] children = fs.listStatus(current);
                if (children == null) {
                    // Some Hadoop versions return null instead of throwing for a missing path.
                    continue;
                }
                for (FileStatus child : children) {
                    if (child.isDir()) {
                        if (recursively) {
                            pending.push(child.getPath());
                        }
                        if (getDirectories) {
                            found.add(child.getPath().toString());
                        }
                    } else if (getFiles) {
                        found.add(child.getPath().toString());
                    }
                }
            }
        }
        return found.toArray(new String[found.size()]);
    }

    /**
     * Lists the files found under the given locations.
     *
     * @param fileOrDir   Comma delimited list of input files or directories in HDFS. Input can be given with
     *                    or without the HDFS URL, i.e. "hdfs://namenode:9000/user/directory" and
     *                    "/user/directory" mean the same when that name node is the default file system
     * @param recursively When set to "true" then recursively opens all sub directories and returns files
     * @return paths of the files found
     * @throws IOException if a file system cannot be reached or a listing fails
     */
    public String[] getFilesOnly(String fileOrDir, boolean recursively) throws IOException {
        return this.getFilesAndDirectories(fileOrDir, recursively, false, true);
    }

    /**
     * Same as {@link #getFilesOnly(String, boolean)} except that it only returns paths
     * in which the regex matches somewhere (the regex does not have to match the whole path).
     *
     * @param fileOrDir   comma delimited list of input files or directories
     * @param recursively whether to descend into sub directories
     * @param regex       regular expression searched for inside each full path string
     * @return paths of the files whose path contains a match for {@code regex}
     * @throws IOException if a file system cannot be reached or a listing fails
     */
    public String[] getFilesOnly(String fileOrDir, boolean recursively, String regex) throws IOException {
        // find() gives "contains a match" semantics without wrapping the expression in ".*",
        // which would bind incorrectly for expressions with a top-level alternation ("a|b").
        Pattern pattern = Pattern.compile(regex);
        ArrayList<String> matching = new ArrayList<String>();
        for (String candidate : this.getFilesOnly(fileOrDir, recursively)) {
            if (pattern.matcher(candidate).find()) {
                matching.add(candidate);
            }
        }
        return matching.toArray(new String[matching.size()]);
    }

    /**
     * Lists the directories found under the given locations.
     *
     * @param fileOrDir   Comma delimited list of input files or directories in HDFS. Input can be given with
     *                    or without the HDFS URL, i.e. "hdfs://namenode:9000/user/directory" and
     *                    "/user/directory" mean the same when that name node is the default file system
     * @param recursively When set to "true" then recursively opens all sub directories and returns sub directories
     * @return paths of the directories found
     * @throws IOException if a file system cannot be reached or a listing fails
     */
    public String[] getDirectoriesOnly(String fileOrDir, boolean recursively) throws IOException {
        return this.getFilesAndDirectories(fileOrDir, recursively, true, false);
    }

    /**
     * Lists both the files and the directories found under the given locations.
     *
     * @param fileOrDir   Comma delimited list of input files or directories in HDFS. Input can be given with
     *                    or without the HDFS URL, i.e. "hdfs://namenode:9000/user/directory" and
     *                    "/user/directory" mean the same when that name node is the default file system
     * @param recursively When set to "true" then recursively opens all sub directories and returns files and sub directories
     * @return paths of the files and directories found
     * @throws IOException if a file system cannot be reached or a listing fails
     */
    public String[] getFilesAndDirectories(String fileOrDir, boolean recursively) throws IOException {
        return this.getFilesAndDirectories(fileOrDir, recursively, true, true);
    }

    /**
     * This method uses recursion to collect every file below a starting point.
     * Directories themselves are not added to the map.
     *
     * @param p             Path to the directory or file you want to start at.
     * @param configuration Configuration
     * @param files         a Map&lt;Path,FileStatus&gt; that receives path to FileStatus entries. Passing
     *                      {@code null} is tolerated for backward compatibility, but the collected
     *                      entries are then not visible to the caller.
     * @throws IOException if the file system cannot be reached or a listing fails
     */
    public void getFiles(Path p, Configuration configuration, Map<Path, FileStatus> files) throws IOException {
        FileSystem fs = FileSystem.get(p.toUri(), configuration);
        if (files == null) {
            files = new HashMap<Path, FileStatus>();
        }
        if (fs.isFile(p)) {
            files.put(p, fs.getFileStatus(p));
            return;
        }
        FileStatus[] statuses = fs.listStatus(p);
        if (statuses == null) {
            // Some Hadoop versions return null instead of throwing for a missing path.
            return;
        }
        for (FileStatus status : statuses) {
            if (status.isDir()) {
                getFiles(status.getPath(), configuration, files);
            } else {
                files.put(status.getPath(), status);
            }
        }
    }

    /**
     * This method deletes all zero byte files within a directory and all its subdirectories.
     * It is best-effort: an {@link IOException} is reported on standard error and not rethrown.
     *
     * @param fileOrDir If file then delete the file if its zero bytes, if directory then delete
     *                  all zero bytes files from the directory
     */
    public void removeAllZeroByteFiles(String fileOrDir) {
        try {
            Configuration configuration = new Configuration();
            Map<Path, FileStatus> files = new HashMap<Path, FileStatus>();
            this.getFiles(new Path(fileOrDir), configuration, files);
            for (Map.Entry<Path, FileStatus> entry : files.entrySet()) {
                if (entry.getValue().getLen() == 0) {
                    Path emptyFile = entry.getKey();
                    FileSystem.get(emptyFile.toUri(), configuration).delete(emptyFile, false);
                }
            }
        } catch (IOException e) {
            // The signature has no throws clause, so keep the original non-throwing contract
            // but say which path the failure belongs to.
            System.err.println("Failed to remove zero byte files under " + fileOrDir);
            e.printStackTrace();
        }
    }

    /**
     * This method returns the size of file or a directory in HDFS.
     *
     * @param fileOrDir file or directory or comma delimited list of files or directories in HDFS, if directory
     *                  then size of all files within the directory and its subdirectories are returned
     * @return size of the file or directory (sum of all files in the directory and sub directories),
     *         summed over every entry of the list
     * @throws IOException if a file system cannot be reached or a content summary cannot be obtained
     */
    public long size(String fileOrDir) throws IOException {
        long totalSize = 0;
        Configuration configuration = new Configuration();
        for (String entry : splitPathList(fileOrDir)) {
            Path p = new Path(entry);
            FileSystem fs = FileSystem.get(p.toUri(), configuration);
            totalSize += fs.getContentSummary(p).getLength();
        }
        return totalSize;
    }

    /**
     * The method moves a single or multiple files or directories, if exists, to trash.
     * It also accepts list of hdfs file or directory delimited by comma.
     * As in the original implementation, {@code Trash.expunge()} is invoked once before
     * anything is moved, and the boolean result of {@code moveToTrash} is not inspected.
     *
     * @param fileOrDir HDFS file or directory name or list of HDFS file or directory names
     * @throws IOException if a file system cannot be reached or the move fails
     */
    public void removeHdfsPath(String fileOrDir)
            throws IOException {
        Configuration configuration = new Configuration();
        Trash trash = new Trash(configuration);
        trash.expunge();
        for (String entry : splitPathList(fileOrDir)) {
            Path p = new Path(entry);
            // Resolve the file system per entry instead of parsing the whole comma list as one URI.
            FileSystem fs = FileSystem.get(p.toUri(), configuration);
            if (fs.exists(p)) {
                trash.moveToTrash(p);
            }
        }
    }
}

分享到：

HadoopFileUtil | hadoop StringUtil

2012-03-01 14:34
浏览 1060
评论(0)
分类:非技术
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

hadoop ExtendedFileUtil

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

hadoop ExtendedFileUtil

评论

发表评论

相关推荐

mapreduce Bet

hadoop 输出格式

hadoop mapreduce 原理

hadoop搭建问题

hadoop输出文件格式

hadoop 学习

hadoop提高性能建议

hadoop例子

hadoop

Hadoop Hive与Hbase整合

hive hadoop 代码解析

Hadoop MapReduce操作MySQL

hadoop hdfs常用操作类

hdfs 操作类自己的

hadoo 文件常用操作

Mapper,Reducer,Wrapper的Java模板

hadoop基础知识

hadoop 自己封装的接口

HadoopFileUtil

hadoop StringUtil

最近访客更多访客>>