import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// A PathFilter that accepts paths containing a yyyy/MM/dd date within a given range
public class DateRangePathFilter implements PathFilter {

  private final Pattern PATTERN = Pattern.compile("^.*/(\\d\\d\\d\\d/\\d\\d/\\d\\d).*$");

  private final Date start, end;

  public DateRangePathFilter(Date start, Date end) {
    // defensive copies, since Date is mutable
    this.start = new Date(start.getTime());
    this.end = new Date(end.getTime());
  }

  public boolean accept(Path path) {
    Matcher matcher = PATTERN.matcher(path.toString());
    if (matcher.matches()) {
      DateFormat format = new SimpleDateFormat("yyyy/MM/dd");
      try {
        return inInterval(format.parse(matcher.group(1)));
      } catch (ParseException e) {
        return false;
      }
    }
    return false;
  }

  private boolean inInterval(Date date) {
    return !date.before(start) && !date.after(end);
  }
}
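A PathFilter takes effect only when it is passed to a listing or globbing call. The following is a minimal usage sketch (not from the original source) that assumes log files live under a hypothetical /journal/yyyy/MM/dd directory tree and combines the filter with FileSystem.globStatus():

import java.text.SimpleDateFormat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DateRangeGlobExample {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    SimpleDateFormat fmt = new SimpleDateFormat("yyyy/MM/dd");
    // glob over the whole date tree (hypothetical layout), then narrow
    // the matches to January 2008 with the DateRangePathFilter
    FileStatus[] matches = fs.globStatus(new Path("/journal/*/*/*"),
        new DateRangePathFilter(fmt.parse("2008/01/01"), fmt.parse("2008/01/31")));
    if (matches != null) {
      for (FileStatus status : matches) {
        System.out.println(status.getPath());
      }
    }
  }
}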
// cc FileCopyWithProgress Copies a local file to a Hadoop filesystem, and shows progress
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.Progressable;

// vv FileCopyWithProgress
public class FileCopyWithProgress {
  public static void main(String[] args) throws Exception {
    String localSrc = args[0];
    String dst = args[1];

    InputStream in = new BufferedInputStream(new FileInputStream(localSrc));

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(dst), conf);
    OutputStream out = fs.create(new Path(dst), new Progressable() {
      public void progress() {
        System.out.print(".");
      }
    });

    // the final true closes both streams when the copy completes
    IOUtils.copyBytes(in, out, 4096, true);
  }
}
// ^^ FileCopyWithProgress
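Each dot corresponds to one invocation of the progress() callback, which Hadoop makes periodically while data is being written to the pipeline. The API does not specify how often the callback fires, so the dots are best read as a liveness indicator rather than a precise measure of throughput.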
// cc FileSystemCat Displays files from a Hadoop filesystem on standard output by using the FileSystem directly
import java.io.InputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

// vv FileSystemCat
public class FileSystemCat {
  public static void main(String[] args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);
    InputStream in = null;
    try {
      in = fs.open(new Path(uri));
      // false: don't close the streams inside copyBytes; the finally block does it
      IOUtils.copyBytes(in, System.out, 4096, false);
    } finally {
      IOUtils.closeStream(in);
    }
  }
}
// ^^ FileSystemCat
// cc FileSystemDoubleCat Displays files from a Hadoop filesystem on standard output twice, by using seek
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

// vv FileSystemDoubleCat
public class FileSystemDoubleCat {
  public static void main(String[] args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);
    FSDataInputStream in = null;
    try {
      in = fs.open(new Path(uri));
      IOUtils.copyBytes(in, System.out, 4096, false);
      in.seek(0); // go back to the start of the file
      IOUtils.copyBytes(in, System.out, 4096, false);
    } finally {
      IOUtils.closeStream(in);
    }
  }
}
// ^^ FileSystemDoubleCat
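Seeking is a relatively expensive operation, and an alternative worth knowing about is the positioned-read API that FSDataInputStream also exposes (via the PositionedReadable interface), which reads at an absolute offset without disturbing the current stream position. A minimal sketch, not from the original source, assuming the target file is at least 100 bytes long:

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class PositionedReadCat {
  public static void main(String[] args) throws Exception {
    String uri = args[0];
    FileSystem fs = FileSystem.get(URI.create(uri), new Configuration());
    FSDataInputStream in = null;
    try {
      in = fs.open(new Path(uri));
      byte[] buffer = new byte[100];
      // read the first 100 bytes at an absolute offset; the stream's own
      // position is left untouched (assumes the file is that long)
      in.readFully(0, buffer);
      System.out.write(buffer);
      System.out.flush();
    } finally {
      IOUtils.closeStream(in);
    }
  }
}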
// cc ListStatus Shows the file statuses for a collection of paths in a Hadoop filesystem
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

// vv ListStatus
public class ListStatus {
  public static void main(String[] args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);

    Path[] paths = new Path[args.length];
    for (int i = 0; i < paths.length; i++) {
      paths[i] = new Path(args[i]);
    }

    FileStatus[] status = fs.listStatus(paths);
    // stat2Paths extracts the Path from each FileStatus entry
    Path[] listedPaths = FileUtil.stat2Paths(status);
    for (Path p : listedPaths) {
      System.out.println(p);
    }
  }
}
// ^^ ListStatus
// cc RegexExcludePathFilter A PathFilter for excluding paths that match a regular expression
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// vv RegexExcludePathFilter
public class RegexExcludePathFilter implements PathFilter {

  private final String regex;

  public RegexExcludePathFilter(String regex) {
    this.regex = regex;
  }

  public boolean accept(Path path) {
    return !path.toString().matches(regex);
  }
}
// ^^ RegexExcludePathFilter
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// A general-purpose PathFilter that either includes or excludes paths matching a regular expression
public class RegexPathFilter implements PathFilter {

  private final String regex;
  private final boolean include;

  public RegexPathFilter(String regex) {
    this(regex, true);
  }

  public RegexPathFilter(String regex, boolean include) {
    this.regex = regex;
    this.include = include;
  }

  public boolean accept(Path path) {
    return (path.toString().matches(regex)) ? include : !include;
  }
}
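As with DateRangePathFilter, these filters do their work only when handed to a listing or globbing call. A minimal sketch, not from the original source, that globs for everything under a hypothetical /2007 date tree and then excludes the last day of the year:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class RegexFilterExample {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    // the glob picks out every day in 2007; the filter then rejects
    // any path ending in 2007/12/31
    FileStatus[] statuses = fs.globStatus(new Path("/2007/*/*"),
        new RegexExcludePathFilter("^.*/2007/12/31$"));
    if (statuses != null) {
      for (FileStatus status : statuses) {
        System.out.println(status.getPath());
      }
    }
  }
}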
// cc URLCat Displays files from a Hadoop filesystem on standard output using a URLStreamHandler
import java.io.InputStream;
import java.net.URL;

import org.apache.hadoop.fs.FsUrlStreamHandlerFactory;
import org.apache.hadoop.io.IOUtils;

// vv URLCat
public class URLCat {

  static {
    // register Hadoop's handler so java.net.URL understands hdfs:// URLs
    URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());
  }

  public static void main(String[] args) throws Exception {
    InputStream in = null;
    try {
      in = new URL(args[0]).openStream();
      IOUtils.copyBytes(in, System.out, 4096, false);
    } finally {
      IOUtils.closeStream(in);
    }
  }
}
// ^^ URLCat
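One caveat: URL.setURLStreamHandlerFactory can be called at most once per JVM, so this approach is unusable if another part of the application (or a third-party library) has already set a factory. For that reason, the FileSystem API shown in FileSystemCat above is generally the preferred way to read from a Hadoop filesystem.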