从一篇文章中筛选出辞典生词本中没有的单词，导出成txt文件

wangshu3000

浏览: 137481 次
性别:
来自: 大连

最近访客更多访客>>

nosuchperson

k-ping

itsfh

anyinger023

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

J2EE企业计算
幸福生活

代码写得比较烂，只是从几篇文章中过滤出单词本中没有的单词，导出成单独的一个文件，再手动一个一个录入到单词本中。小工具，mark一下。使用dom4j。

<?xml version="1.0" encoding="UTF-8"?>
<wordbook><item>
		<word>daunting</word>
		<trans><![CDATA[adj. 使人畏缩的；使人气馁的；令人怯步的 
daunting: 令人沮丧 | 使人畏缩的 | 使人气馁的]]></trans>
		<phonetic><![CDATA[['dɔ:ntiŋ]]]></phonetic>
		<tags/>
		<progress>10</progress>
	</item><item>
		<word>informative</word>
		<trans><![CDATA[adj. 教育性的，有益的；情报的；见闻广博的 
informative: 告知性的 | 使知道消息的 | 有益的]]></trans>
		<phonetic><![CDATA[[in'fɔ:mətiv]]]></phonetic>
		<tags/>
		<progress>10</progress>
	</item><item>
		<word>contribute</word>
		<trans><![CDATA[vt. 贡献，出力；投稿；捐献 
vt. 贡献，出力；投稿；捐献 
contribute: 贡献 | 捐助 | 做出贡献]]></trans>
		<phonetic><![CDATA[[kən'tribju:t]]]></phonetic>
		<tags/>
		<progress>10</progress>
	</item>
</wordbook>

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;

/**
 * Small utility that lists the English words which occur in a set of text files but are missing
 * from a dictionary word book, and writes them to {@code result.txt}, one word per line.
 *
 * <p>The word book is read from {@code all_sych.xml}: a {@code <wordbook>} of {@code <item>}
 * elements, each with {@code word}, {@code trans}, {@code phonetic}, {@code tags} and
 * {@code progress} children. All file names are resolved against the working directory.
 */
public class WordNewMain {

	/** Word book exported from the dictionary application. */
	private static final String WORD_BOOK_FILE = "all_sych.xml";

	/** Text files to scan, in processing order. */
	private static final String[] TEXT_FILES = { "8word.txt", "steve_4p.txt", "gre.txt", "gz.txt" };

	/** File that receives the unknown words, one per line. */
	private static final String RESULT_FILE = "result.txt";

	/**
	 * Loads the word book, scans every text file for words that are not in it and writes those
	 * words to the result file.
	 *
	 * @param args unused
	 * @throws DocumentException if the word book cannot be parsed
	 * @throws IOException if any input file cannot be read or the result file cannot be written
	 */
	public static void main(String[] args) throws DocumentException, IOException {
		Map<String, Word> listMap = loadWordBook(WORD_BOOK_FILE);

		// Key and value are the same word; the map is only used for de-duplication.
		Map<String, String> countArea = new HashMap<String, String>();
		System.out.println("Start count~");
		for (String textFile : TEXT_FILES) {
			collectUnknownWords(textFile, listMap, countArea);
		}

		// The result file is opened only after all inputs were read successfully, so a failed
		// run no longer truncates the result of a previous one.
		FileWriter fw = new FileWriter(RESULT_FILE);
		try {
			for (String unknownWord : countArea.keySet()) {
				fw.write(unknownWord + "\r\n");
			}
		} finally {
			fw.close();
		}
		System.out.println("End count~");
		System.out.println("Sum word of steve is :" + countArea.size());
	}

	/**
	 * Reads the word book and indexes its entries by lower-cased word. When a word occurs more
	 * than once, the first entry is kept and its progress is raised to the highest value seen.
	 *
	 * @param fileName path of the word book XML file
	 * @return map from lower-cased word to its entry
	 * @throws DocumentException if the XML cannot be parsed
	 * @throws IOException if the file cannot be opened or closed
	 */
	private static Map<String, Word> loadWordBook(String fileName) throws DocumentException, IOException {
		SAXReader saxReader = new SAXReader();
		saxReader.setEncoding("utf-8");
		Document whole1Xml;
		BufferedInputStream in = new BufferedInputStream(new FileInputStream(fileName));
		try {
			whole1Xml = saxReader.read(in);
		} finally {
			in.close();
		}

		// List<?> plus a cast avoids depending on the element type of the list returned by selectNodes.
		List<?> whole1List = whole1Xml.selectNodes("//wordbook/item");
		System.out.println("whole1 List Size:" + whole1List.size());

		Map<String, Word> listMap = new HashMap<String, Word>();
		for (int i = 0; i < whole1List.size(); i++) {
			Element e = (Element) whole1List.get(i);
			Node word = e.selectSingleNode("word");
			Node trans = e.selectSingleNode("trans");
			Node phonetic = e.selectSingleNode("phonetic");
			Node tags = e.selectSingleNode("tags");
			Node progress = e.selectSingleNode("progress");

			// Look up and store under the same lower-cased key; the lookup previously used the
			// raw word, so entries containing upper-case letters were never found again.
			String key = word.getStringValue().toLowerCase();
			Word w = listMap.get(key);
			if (w == null) {
				// NOTE(review): the item is detached from the document before it is copied,
				// as in the original code; presumably to release the tree -- confirm it is needed.
				e.detach();
				w = new Word(word.getStringValue(), trans.getStringValue(), phonetic.getStringValue(),
						tags.getStringValue(), progress.getStringValue());
				listMap.put(key, w);
			} else if (Integer.parseInt(w.getProgress()) < Integer.parseInt(progress.getStringValue())) {
				w.setProgress(progress.getStringValue());
			}
		}
		return listMap;
	}

	/**
	 * Scans one text file and records every word that is not in the word book. A word is a
	 * maximal run of ASCII letters within a single line; it is lower-cased before the lookup and
	 * runs of a single letter are ignored.
	 *
	 * <p>Letters are matched on decoded characters rather than on the bytes of the line: in
	 * multi-byte encodings such as GBK the trailing byte of a non-ASCII character can fall into
	 * the ASCII letter range and would be mistaken for a letter. Using a growable buffer also
	 * removes the former 30-letter limit, which failed with an array index error on longer runs.
	 *
	 * @param fileName path of the text file, read with the platform default charset
	 * @param knownWords word book entries keyed by lower-cased word
	 * @param unknownWords receives each unknown word as both key and value
	 * @throws IOException if the file cannot be read
	 */
	private static void collectUnknownWords(String fileName, Map<String, Word> knownWords,
			Map<String, String> unknownWords) throws IOException {
		BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName)));
		try {
			System.out.println("Ok, find the file!");
			StringBuilder current = new StringBuilder();
			String line;
			while ((line = br.readLine()) != null) {
				for (int i = 0; i < line.length(); i++) {
					char c = line.charAt(i);
					if (isAsciiLetter(c)) {
						current.append(c);
					} else {
						flushWord(current, knownWords, unknownWords);
					}
				}
				// Words never span lines.
				flushWord(current, knownWords, unknownWords);
			}
		} finally {
			br.close();
		}
	}

	/** Returns whether {@code c} is one of {@code A-Z} or {@code a-z}. */
	private static boolean isAsciiLetter(char c) {
		return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
	}

	/**
	 * Records the buffered word if it has more than one letter and is not in the word book, then
	 * empties the buffer.
	 */
	private static void flushWord(StringBuilder current, Map<String, Word> knownWords,
			Map<String, String> unknownWords) {
		if (current.length() > 1) {
			String theWord = current.toString().toLowerCase();
			if (!knownWords.containsKey(theWord)) {
				unknownWords.put(theWord, theWord);
			}
		}
		current.setLength(0);
	}

}

分享到：

Senior Solution Architect(Systems) | Android程序全屏方法

2012-02-23 16:28
浏览 1114
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

从一篇文章中筛选出辞典生词本中没有的单词，导出成txt文件

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

从一篇文章中筛选出辞典生词本中没有的单词，导出成txt文件

评论

发表评论

相关推荐

MyLab

Learning

Eclipse 绘制草图的plugin

My Environment

话单匹配问题

Senior Solution Architect(Systems)

关于火车订票系统瓶颈的分析及解决方案

[疑问]关于NIO的耗时服务端业务逻辑问题？？？？

[小代码]蜘蛛爬虫，抓取某网站所有图片文章中的图片~

写了个小代码，统计史蒂夫乔布斯传英文版的单词数量

软件项目版本号的命名规则及格式介绍

中文编码基础知识介绍

软件架构

理解架构师

架构和架构设计师

构架师之路

软件架构师成长之路

某公司的技术能力

大型网站架构之:MySpace的体系架构

IT网站

最近访客更多访客>>