论坛首页 Java企业应用论坛

POI 关于对 ms word的读写代码

浏览 33356 次
该帖已经被评为良好帖
作者 正文
   发表时间:2006-09-14  
read word:
/**
 * Extracts the text of a binary Word document by walking the piece table
 * stored in its table stream ("0Table" or "1Table") and decoding each piece
 * out of the "WordDocument" stream.
 *
 * <p>Every piece is decoded on its own: a piece whose file position has the
 * compression bit clear is read as UTF-16LE (two bytes per character), any
 * other piece is read as ISO-8859-1 (one byte per character). A single space
 * is appended after each piece.
 */
public class WordExtractor {
	/** Offset in the "WordDocument" stream of the 16-bit flag word. */
	private static final int FLAGS_OFFSET = 0xa;
	/** Bit of the flag word that selects "1Table" (set) or "0Table" (clear). */
	private static final int TABLE_STREAM_FLAG = 0x200;
	/** Offset in the "WordDocument" stream of the piece-table offset. */
	private static final int COMPLEX_OFFSET_POSITION = 0x1a2;
	/** Bit of a piece's file position that marks it as 8-bit text. */
	private static final int COMPRESSED_FLAG = 0x40000000;

	public WordExtractor() {
	}

	/**
	 * Reads a Word document from {@code in} and returns its text.
	 *
	 * @param in stream positioned at the start of the OLE2 file; it is not
	 *           closed by this method
	 * @return the concatenated text pieces, each followed by one space
	 * @throws IOException if the stream cannot be read or the piece table is
	 *                     malformed
	 */
	public String extractText(InputStream in) throws IOException {
		POIFSFileSystem fsys = new POIFSFileSystem(in);
		byte[] header = readStream(fsys, "WordDocument");

		// Document header: the flag word tells which table stream is in use.
		int info = LittleEndian.getShort(header, FLAGS_OFFSET);
		String tableName = (info & TABLE_STREAM_FLAG) != 0 ? "1Table" : "0Table";

		// Position of the piece table inside the table stream.
		int complexOffset = LittleEndian.getInt(header, COMPLEX_OFFSET_POSITION);

		byte[] tableStream = readStream(fsys, tableName);

		ArrayList text = new ArrayList();
		findText(tableStream, complexOffset, text);

		StringBuffer sb = new StringBuffer();
		for (int x = 0; x < text.size(); x++) {
			WordTextPiece piece = (WordTextPiece) text.get(x);
			int start = piece.getStart();
			int length = piece.getLength();

			// The byte count is decided per piece. Sharing one multiplier for
			// the whole document truncated Unicode pieces whenever the file
			// also contained an 8-bit piece.
			if (piece.usesUnicode()) {
				sb.append(new String(header, start, length * 2, "UTF-16LE"));
			} else {
				// NOTE(review): 8-bit pieces may really be Windows-1252;
				// ISO-8859-1 is kept to preserve existing output - confirm.
				sb.append(new String(header, start, length, "ISO-8859-1"));
			}
			sb.append(" ");
		}
		return sb.toString();
	}

	/**
	 * Reads the complete contents of the named top-level stream.
	 *
	 * @throws IOException if the entry is missing or ends before its declared size
	 */
	private static byte[] readStream(POIFSFileSystem fsys, String name) throws IOException {
		DocumentEntry entry = (DocumentEntry) fsys.getRoot().getEntry(name);
		byte[] data = new byte[entry.getSize()];
		DocumentInputStream din = fsys.createDocumentInputStream(name);
		try {
			// A single read() is not guaranteed to fill the buffer.
			int filled = 0;
			while (filled < data.length) {
				int n = din.read(data, filled, data.length - filled);
				if (n <= 0) {
					throw new IOException("corrupted Word file");
				}
				filled += n;
			}
		} finally {
			din.close();
		}
		return data;
	}

	/**
	 * Parses the piece table found at {@code complexOffset} in
	 * {@code tableStream} and appends one {@link WordTextPiece} per entry to
	 * {@code text}.
	 *
	 * @throws IOException if the data at {@code complexOffset} is not a
	 *                     well-formed piece table
	 */
	private static void findText(byte[] tableStream, int complexOffset, ArrayList text)
		throws IOException {
		int pos = complexOffset;

		// Skip the prm blocks (marker byte 1, 16-bit size, payload) that
		// precede the piece table; they carry data for fast-saved files.
		while (pos >= 0 && pos + 2 < tableStream.length && tableStream[pos] == 1) {
			int skip = LittleEndian.getShort(tableStream, pos + 1);
			if (skip < 0) {
				throw new IOException("corrupted Word file");
			}
			pos += 3 + skip;
		}

		// The piece table starts with marker byte 2 followed by a 32-bit size.
		if (pos < 0 || pos + 4 >= tableStream.length || tableStream[pos] != 2) {
			throw new IOException("corrupted Word file");
		}
		int pieceTableSize = LittleEndian.getInt(tableStream, pos + 1);
		pos += 5;

		// Layout: (pieces + 1) 4-byte character positions, then one 8-byte
		// descriptor per piece, i.e. 12 bytes per piece plus 4.
		int pieces = (pieceTableSize - 4) / 12;
		if (pieces > 0 && pos + 12L * pieces + 2 > tableStream.length) {
			// The last descriptor's file position would lie past the end.
			throw new IOException("corrupted Word file");
		}

		int descriptors = pos + (pieces + 1) * 4;
		for (int x = 0; x < pieces; x++) {
			// The file position sits 2 bytes into each descriptor.
			int filePos = LittleEndian.getInt(tableStream, descriptors + (x * 8) + 2);
			boolean unicode = (filePos & COMPRESSED_FLAG) == 0;
			if (!unicode) {
				// 8-bit piece: clear the flag and halve to get the real
				// byte offset in the document stream.
				filePos &= ~COMPRESSED_FLAG;
				filePos /= 2;
			}
			// Length in characters = difference of consecutive positions.
			int totLength =
				LittleEndian.getInt(tableStream, pos + (x + 1) * 4)
					- LittleEndian.getInt(tableStream, pos + (x * 4));

			text.add(new WordTextPiece(filePos, totLength, unicode));
		}
	}

	/**
	 * Prints the text of a Word file to standard output.
	 *
	 * @param args optional; {@code args[0]} is the file to read, defaulting to
	 *             {@code C:\test.doc}
	 */
	public static void main(String[] args) {
		String path = (args != null && args.length > 0) ? args[0] : "C:\\test.doc";
		InputStream in = null;
		try {
			in = new FileInputStream(new File(path));
			String s = new WordExtractor().extractText(in);
			System.out.println(s);
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing the input fails.
				}
			}
		}
	}

}
/**
 * Immutable description of one run of text: where it starts, how long it is,
 * and whether it is flagged as Unicode.
 */
class WordTextPiece {
	private final int offset;
	private final int count;
	private final boolean wide;

	/**
	 * @param start   value later returned by {@link #getStart()}
	 * @param length  value later returned by {@link #getLength()}
	 * @param unicode value later returned by {@link #usesUnicode()}
	 */
	public WordTextPiece(int start, int length, boolean unicode) {
		this.offset = start;
		this.count = length;
		this.wide = unicode;
	}

	/** @return the start value given to the constructor */
	public int getStart() {
		return this.offset;
	}

	/** @return the length value given to the constructor */
	public int getLength() {
		return this.count;
	}

	/** @return the unicode flag given to the constructor */
	public boolean usesUnicode() {
		return this.wide;
	}

}


write word


	/**
	 * Writes {@code content} into a new OLE2 file at {@code path}, storing the
	 * bytes of the string as a stream named "WordDocument".
	 *
	 * <p>The bytes are written as-is, encoded with the platform default
	 * charset. NOTE(review): nothing here builds a Word file header, so the
	 * result is presumably not a well-formed Word document - confirm before
	 * relying on it.
	 *
	 * @param path    file to create or overwrite
	 * @param content text whose bytes become the "WordDocument" stream
	 * @return {@code true} if the file was written and closed without error,
	 *         {@code false} if an {@link IOException} occurred (its stack trace
	 *         is printed)
	 */
	public boolean writeWordFile(String path, String content) {
		boolean w = false;
		FileOutputStream ostream = null;
		try {
			byte b[] = content.getBytes();

			POIFSFileSystem fs = new POIFSFileSystem();
			DirectoryEntry directory = fs.getRoot();
			directory.createDocument("WordDocument", new ByteArrayInputStream(b));

			ostream = new FileOutputStream(path);
			fs.writeFilesystem(ostream);

			// Close on the success path so that a failed flush is reported
			// as a failure instead of being swallowed in finally.
			ostream.close();
			ostream = null;
			w = true;
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (ostream != null) {
				try {
					ostream.close();
				} catch (IOException ignored) {
					// Already failing; the original error was printed above.
				}
			}
		}

		return w;
	}

写操作的代码还是有些问题:打开WORD时提示要选择字符类型
希望能改进!


当然这几个jar是少不了的
poi-2.5.1-final-20040804.jar
poi-contrib-2.5.1-final-20040804.jar
poi-scratchpad-2.5.1-final-20040804.jar
   发表时间:2006-09-14  
不错,收藏一下。论坛里这方面帖子不多
0 请登录后投票
   发表时间:2006-09-14  
代码里应该关了表情符号吧
0 请登录后投票
   发表时间:2007-02-07  
写word的代码不对!
0 请登录后投票
   发表时间:2007-02-07  
希望楼主把表情符号关掉 重新编辑一下 让大家能欣赏到正确的代码
0 请登录后投票
   发表时间:2007-03-06  
感谢楼主,这个word extractor能够比较好的支持中文。
原先我是使用nutch的word文本提取,但是相当大部分的中文word文档无法正确提取,到官方网站查看他们的解决方案,是这么说的:“Document with 2-byte characters (that's how Chinese characters are probably stored) are not correctly handled by HWPF.” One more thing you need to consider: HWPF cannot handle "fast saved" Word files. If the documents you need to parse are "fast saved" this adds an extra level of complexity.


有点小问题,希望楼主有时间的时候帮忙大家修复一下,那就是有部分提取的文本前后有小方框的,我想应该是这些字符本不该被提取。
0 请登录后投票
   发表时间:2007-03-15  
poi-2.5.1-final-20040804.jar
poi-contrib-2.5.1-final-20040804.jar
poi-scratchpad-2.5.1-final-20040804.jar

能提供这几个包的下载吗?
0 请登录后投票
   发表时间:2007-04-02  
http://jakarta.apache.org/poi/index.html
0 请登录后投票
   发表时间:2007-04-02  
写word的少见,收了
0 请登录后投票
   发表时间:2007-06-15  
问个问题,写word java哪个开源包比较好
我看POI对word支持不太够啊
0 请登录后投票
论坛首页 Java企业应用版

跳转论坛:
Global site tag (gtag.js) - Google Analytics