`
log_cd
  • 浏览: 1098685 次
  • 性别: Icon_minigender_1
  • 来自: 成都
社区版块
存档分类
最新评论

Java Regex To Use

阅读更多
 /**
  * Extracts the directory portion of a drive-qualified path.
  * <p>
  * The match starts at one to four ASCII letters followed by a colon (the
  * drive prefix) and extends through the last '\' or '/' that follows it on
  * the same line.
  *
  * @param file the path to inspect, e.g. {@code C:\dir\file.txt}
  * @return the matched text including the trailing separator, or an empty
  *         string when the path contains no drive prefix followed by a separator
  */
 public static String getFileDirectory(String file){
  // The drive prefix must be [a-zA-Z]: the range A-z would also accept
  // the punctuation characters [ \ ] ^ _ ` that sit between 'Z' and 'a'.
  Pattern p = Pattern.compile("[a-zA-Z]{1,4}:.*[\\\\/]");
  Matcher m = p.matcher(file);
  // The pattern has no capturing group, so the whole match is the result.
  return m.find() ? m.group() : "";
 }
 
 /**
  * Extracts the file name (the text after the last path separator).
  * <p>
  * Both '\' and '/' are accepted as separators. The separator must be
  * preceded by at least one character and followed by at least one
  * character; otherwise nothing matches.
  *
  * @param file the path to inspect
  * @return the text after the last separator, or an empty string when the
  *         path contains no usable separator
  */
 public static String getFileName(String file){
  // Inside a character class '|' is a literal, not alternation, so the
  // former [\\|/] wrongly treated '|' as a path separator.
  Pattern p = Pattern.compile(".+[\\\\/](.+)$");
  Matcher m = p.matcher(file);
  return m.find() ? m.group(1) : "";
 }
 
 /**
  * Extracts the file extension (the text after the last dot of the file name).
  * <p>
  * A dot only counts when it appears after the last '/' or '\' separator, so
  * a dot inside a directory name is not mistaken for an extension marker.
  *
  * @param file the file name or path to inspect
  * @return the extension without the leading dot, or an empty string when
  *         the file name has no dot or ends with a dot
  */
 public static String getFileExtName(String file){
  int lastSeparator = Math.max(file.lastIndexOf('/'), file.lastIndexOf('\\'));
  int lastDot = file.lastIndexOf('.');
  // No dot at all (-1 <= -1), or the last dot belongs to a directory name:
  // there is no extension, so do not return the name or a path fragment.
  if(lastDot <= lastSeparator){
   return "";
  }
  return file.substring(lastDot + 1);
 }

/**
 * Demonstrates capturing groups: each parenthesised group in a pattern can be
 * retrieved separately, numbered from 1 starting with the leftmost group.
 * In the URL pattern used here group 1 is the protocol (http, https or ftp)
 * and group 2 is the domain. Every URL found in the input has its domain
 * printed to standard output, one per line.
 *
 * @param data the text to scan for URLs
 */
public static void getMatchGroup(String data) {
	Matcher matcher = Pattern
			.compile("(http|https|ftp)://([a-zA-Z0-9-\\.]+)[/\\w\\.\\-\\+\\?%=&;:,#]*")
			.matcher(data);
	while (matcher.find()) {
		// Group 2 holds the domain part of the matched URL.
		System.out.println(matcher.group(2));
	}
}

/**
 * Demonstrates backreferences: referring, inside a pattern, to the text
 * captured by an earlier group (for example \3 refers to the third group).
 * A backreference only matches an exact repetition of what that group
 * captured. This method reports accidentally doubled words.
 * <p>
 * Example input:
 * {@code " The the water molecules are made of of hydrogen and oxygen"}
 *
 * @param data the text to scan; each hit is printed to standard output
 */
public static void getBackReferencesGroup(String data) {
	// Pattern layout: one whitespace character, one word from the fixed list,
	// more whitespace, the same word again (\1 refers back to group 1), and
	// finally a whitespace or punctuation character.
	Pattern doubledWord = Pattern.compile(
			"\\s(of|or|the|to)\\s+\\1[\\s\\.,;]",
			Pattern.CASE_INSENSITIVE); // matching ignores case
	Matcher hits = doubledWord.matcher(data);
	while (hits.find()) {
		System.out.println("Repeated " + hits.group(1) + " starting at " + hits.start());
	}
}

/**
 * Collects the text found between curly braces.
 * <p>
 * Each run of non-brace characters that sits directly between a '{' and a
 * '}' is appended to the result, followed by a newline. The braces
 * themselves are matched with lookbehind/lookahead and are not included.
 *
 * @param inputStr the text to scan
 * @return every brace-enclosed run, each terminated by '\n'; an empty
 *         string when nothing matches
 */
public static String getBraceContent(String inputStr){
	Matcher braces = Pattern
			.compile("(?<=\\{)[^\\{\\}]*(?=\\})", Pattern.CASE_INSENSITIVE)
			.matcher(inputStr);
	StringBuilder collected = new StringBuilder();
	while(braces.find()){
		collected.append(braces.group()).append("\n");
	}
	return collected.toString();
}

 /**
  * Prints the attributes of the first occurrence of an HTML tag.
  * <p>
  * The first {@code <label ...>} tag in the markup is located, its
  * attributes written as {@code name="value"} (lower-case name,
  * double-quoted non-empty value) are collected into a map, and the map is
  * handed to {@code printMapData}. Nothing is collected when the tag is absent.
  *
  * @param html  the markup to search
  * @param label the tag name whose attributes are wanted, e.g. font, img
  */
 public static void getHtmlAttribute(String html,String label){
  Map<String, String> attributes = new HashMap<String, String>();
  // Pattern.quote keeps regex metacharacters in the label literal, and the
  // lookahead requires the tag name to end right there, so that label "b"
  // no longer matches "<body ...>".
  Pattern tagPattern = Pattern.compile(
    "<" + Pattern.quote(label) + "(?=[\\s/>])\\s*([^>]*)\\s*>");
  Pattern attributePattern = Pattern.compile("([a-z]+)\\s*=\\s*\"([^\"]+)\"");

  Matcher tagMatcher = tagPattern.matcher(html);
  if(tagMatcher.find()){
   // Group 1 is everything between the tag name and the closing '>'.
   Matcher attributeMatcher = attributePattern.matcher(tagMatcher.group(1));
   while(attributeMatcher.find()){
    attributes.put(attributeMatcher.group(1), attributeMatcher.group(2));
   }
  }
  printMapData(attributes);
 }
 
 /**
  * Prints every entry of the map to standard output as {@code key=value},
  * one entry per line, in the map's own iteration order.
  *
  * @param map the map to print
  */
 public static void printMapData(Map<?, ?> map){
  // Wildcard types replace the raw Map/Set/Iterator and the unchecked cast;
  // callers passing a raw or parameterized Map compile unchanged.
  for(Map.Entry<?, ?> entry : map.entrySet()){
   System.out.println(entry.getKey()+"="+entry.getValue());
  }
 }


 /**
  * Converts a Word document to HTML by driving Microsoft Word through the
  * Jacob toolkit.
  * <p>
  * Only paths whose extension, as reported by {@code getFileExtName},
  * equals "doc" (ignoring case) are processed. The HTML file is written to
  * the same path with its last four characters replaced by ".htm".
  *
  * @param absPath absolute path of the file
  * @return {@code true} after a conversion, {@code false} when the
  *         extension is not "doc"
  * @throws ProgramException when any step of the Word automation throws
  */
 public static boolean wordFormatToHtml(String absPath) throws ProgramException{
  String extension = getFileExtName(absPath); // file type
  if(!extension.equalsIgnoreCase("doc")){
   return false;
  }

  // Full path of the HTML output: drop the trailing ".doc" (4 characters).
  String htmlPath = absPath.substring(0, (absPath.length() - 4)) + ".htm";

  // Start Word.
  ActiveXComponent word = new ActiveXComponent("Word.Application");
  try{
   // Run Word without showing its window.
   word.setProperty("Visible", new Variant(false));

   // Open the Word file.
   // NOTE(review): the two flags are presumably ConfirmConversions=false
   // and ReadOnly=true - confirm against the Word object model.
   Dispatch documents = word.getProperty("Documents").toDispatch();
   Object[] openArgs = new Object[]{absPath, new Variant(false), new Variant(true)};
   Dispatch doc = Dispatch.invoke(documents, "Open", Dispatch.Method, openArgs, new int[1]).toDispatch();

   // Accept all tracked changes before exporting.
   Dispatch wordBasic = (Dispatch) Dispatch.call(word, "WordBasic").getDispatch();
   Dispatch.call(wordBasic, "AcceptAllChangesInDoc");

   // Save as htm; 8 is passed as the format argument.
   Object[] saveArgs = new Object[]{htmlPath, new Variant(8)};
   Dispatch.invoke(doc, "SaveAs", Dispatch.Method, saveArgs, new int[1]);

   // Close the document.
   Dispatch.call(doc, "Close", new Variant(false));
  }catch(Exception e){
   throw new ProgramException("error$Word转换为HTML时出错!");
  }finally{
   // Always quit Word, whether or not the conversion succeeded.
   word.invoke("Quit", new Variant[] {});
  }
  // Conversion finished.
  return true;
 }
 
 /**
  * Reads an HTML file line by line and returns its cleaned-up content.
  * <p>
  * The file is decoded with the platform default charset and every line is
  * terminated with '\n' in the result. Afterwards all {@code <img ...>} and
  * {@code <v:imagedata ...>} tags are removed and every single quote is
  * replaced by a double quote.
  *
  * @param filePath path of the HTML file
  * @return the file content after the clean-up described above
  * @throws ProgramException when the file does not exist or cannot be read
  */
 public static String getHTMLContent(String filePath) throws ProgramException{
  StringBuilder sb = new StringBuilder();
  BufferedReader br = null;
  try{
   br = new BufferedReader(new InputStreamReader(new FileInputStream(new File(filePath))));
   String line;
   while((line = br.readLine()) != null){
    sb.append(line);
    sb.append('\n'); // readLine strips the terminator, so put one back
   }
  }catch(FileNotFoundException e){
   throw new ProgramException("error$读HTML文件时,文件没有找到");
  }catch(IOException e){
   throw new ProgramException("error$读HTML文件时,出现IO异常");
  }finally{
   // Release the file handle on every path; previously it was never closed.
   if(br != null){
    try{
     br.close();
    }catch(IOException ignored){
     // Nothing useful can be done when closing a reader fails, and
     // it must not mask an exception thrown by the read above.
    }
   }
  }
  String temp = sb.toString();

  // Images are not wanted: strip <img> tags.
  temp = Pattern.compile("<img\\s*([^>]*)\\s*>").matcher(temp).replaceAll("");

  // Also strip the <v:imagedata> tags.
  temp = Pattern.compile("<v:imagedata\\s*([^>]*)\\s*>").matcher(temp).replaceAll("");

  temp = temp.replace("\'", "\"");
  return temp;
 }

说明:
特殊构造(非捕获)
(?:X) X,作为非捕获组
(?idmsux-idmsux)  不匹配任何内容,但打开 "-" 之前列出的匹配标志(on),关闭 "-" 之后列出的匹配标志(off)
(?idmsux-idmsux:X)   X,作为非捕获组,并在组内打开 "-" 之前列出的标志、关闭 "-" 之后列出的标志
(?=X) X,通过零宽度的正 lookahead
(?!X) X,通过零宽度的负 lookahead
(?<=X) X,通过零宽度的正 lookbehind
(?<!X) X,通过零宽度的负 lookbehind
(?>X) X,作为独立的非捕获组
public static Pattern compile(String regex,int flags);
参数:
regex - 要编译的表达式。
flags - 匹配标志,可能包括 CASE_INSENSITIVE、MULTILINE、DOTALL、UNICODE_CASE、CANON_EQ、UNIX_LINES、LITERAL 和 COMMENTS 的位掩码。
Pattern中的定义如下:
public static final int UNIX_LINES = 0x01;
public static final int CASE_INSENSITIVE = 0x02;
public static final int COMMENTS = 0x04;
public static final int MULTILINE = 0x08;
public static final int LITERAL = 0x10;
public static final int DOTALL = 0x20;
public static final int UNICODE_CASE = 0x40;
public static final int CANON_EQ = 0x80;

资源:
1.java.util.regex类 Pattern
分享到:
评论
1 楼 shappy1978 2010-02-09  
谢谢,上面的方法对于接收修订内容可行,可以解决POI无法解析文本是否属于修订内容的问题。

相关推荐

    Packt.Java.9.Regular.Expressions

    Going forward, you will learn to use zero-length assertions and lookarounds, parsing the source code, and processing the log files. Finally, you will master tips, tricks, and best practices in regex ...

    java字符串匹配

    3.Build a program using Java array of string, you need to input 5 or more most famous universities in the world, and the annual ... (Hint: one of solutions is to use java.util.regex API 正则表达式)

    Jeffrey E. F. Friedl - Mastering.Regular.Expressions.3rd.Edition

    this edition has been updated throughout to reflect advances in other languages, including expanded in-depth coverage of Sun's java.util.regex package, which has emerged as the standard Java regex ...

    UE(官方下载)

    "Tagging" the find data allows UltraEdit/UEStudio to re-use the data similar to variable during a replace. For example, If ^(h*o^) ^(f*s^) matches "hello folks", ^2 ^1 would replace it with "folks ...

    正则表达式工具:JGsoft RegexBuddy v3.4.2 零售版(无需要注册激活)

    Use the regex with source code snippets automatically adjusted to the particulars of your programming language. Collect and document libraries of regular expressions for future reuse. GREP (search-...

    Inverted-Index

    倒排索引- unix join - http://www.albany.edu/~ig4895/join.htm- get path-filenames - http://stackoverflow.com/questions/6844785/how-to-use-regex-with-find-command- executing unix commands from java - ...

    带注释的Bootstrap.java

    import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.catalina.Globals; import org.apache.catalina.security.SecurityClassLoad; import org.apache.catalina.startup....

    streamflyer-regex-fast:字符流上的快速匹配

    // we use FastRegexModifier instead of RegexModifier Modifier fastModifier = new FastRegexModifier("edit(\\s+)stream", Pattern.DOTALL, "modify$1stream"); // create the modifying reader that wraps the ...

    Android代码-口语化的正则表达式

    You can use SNAPSHOT dependency with adding to pom.xml: ossrh https://oss.sonatype.org/content/repositories/snapshots Examples VerbalExpression testRegex = VerbalExpression.regex() .startOfLine...

    JGsoft.RegexBuddy.v3.4.2

    Use the regex with source code snippets automatically adjusted to the particulars of your programming language. Collect and document libraries of regular expressions for future reuse. GREP (search-...

    Mastering Regular Expressions, 3rd Edition

    They are now standard features in a wide range of languages and popular tools, including Perl, Python, Ruby, Java, VB.NET and C# (and any language using the .NET Framework), PHP, and MySQL....

    ZendFramework中文文档

    9.1.2. Why Use Zend_Date? 9.2. 操作理论 9.2.1. 内部(Internals) 9.3. Basic Methods 9.3.1. The current date 9.3.2. Zend_Date by Example 9.3.2.1. Ouput a Date 9.3.2.2. Setting a Date 9.3.2.3. ...

Global site tag (gtag.js) - Google Analytics