POI实现超大数据的Excel的读写操作，支持Excel最大行数。

thinkgem

浏览: 590759 次
性别:
来自: 济南

最近访客更多访客>>

thesixonenine

sense_yip

u010868738

bbwang8088

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Java
POI
Heracles
JeeSite
开源项目

前端时间写了注解方式Excel的读取和写入，它是根据注解完成Excel的操作，虽说支持大数据，但对于超大数据就无能为力了，因为它的读写期间都是将所有数据放入系统内存的，除非你有超大的内存。

因项目需要对超大数据的Excel读写操作，于是网上找了个超大数据的读写代码，这个不需要太大内存。并对此进行了简单的修改。

原理如下：

Excel超大数据读取：抽象Excel2007读取器，excel2007的底层数据结构是xml文件，采用SAX的事件驱动的方法解析 xml，需要继承DefaultHandler，在遇到文件内容时，事件会触发，这种做法可以大大降低内存的耗费，特别使用于大数据量的文件。

Excel超大数据写入：抽象excel2007读入器，先构建.xlsx一张模板，改写模板中的sheet.xml, 使用这种方法写入.xlsx文件，不需要太大的内存。

先看调用示例：

String file = "E:/导入测试数据.xlsx";

ExcelReader reader = new ExcelReader() {
	public void getRows(int sheetIndex, int curRow, List<String> rowList) {
		
		System.out.println("Sheet:" + sheetIndex + ", Row:" + curRow + ", Data:" +rowList);
		
	}
};
reader.process(file, 1);

String file = "E:/导出测试数据.xlsx";

ExcelWriter writer = new ExcelWriter() {
	public void generate() throws Exception {
		
		// 电子表格开始
		this.beginSheet();
		
		for (int rownum = 0; rownum < 100; rownum++) {
			// 插入新行
			this.insertRow(rownum);
			
			// 建立新单元格,索引值从0开始,表示第一列
			this.createCell(0, "第 " + rownum + " 行");
			this.createCell(1, 34343.123456789);
			this.createCell(2, "23.67%");
			this.createCell(3, "12:12:23");
			this.createCell(4, "2014-10-11 12:12:23");
			this.createCell(5, "true");
			this.createCell(6, "false");

			// 结束行
			this.endRow();
		}
		
		// 电子表格结束
			this.endSheet();
		}
	};
	writer.process(file);
}

这里只展示了对数据的读取和写入，如果正式保存到数据库时建议读取一部分（如100条）再写入一次数据库，尽量不要读取一条就写入一条，这样会非常耗费资源。

源代码如下：

import java.io.InputStream;
import java.math.BigDecimal;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

/**
 * Excel超大数据读取，抽象Excel2007读取器，excel2007的底层数据结构是xml文件，采用SAX的事件驱动的方法解析
 * xml，需要继承DefaultHandler，在遇到文件内容时，事件会触发，这种做法可以大大降低 内存的耗费，特别使用于大数据量的文件。
 * @version 2014-9-2
 */
public abstract class ExcelReader extends DefaultHandler {

	// 共享字符串表
	private SharedStringsTable sst;
	
	// 上一次的内容
	private String lastContents;
	private boolean nextIsString;

	private int sheetIndex = -1;
	private List<String> rowList = new ArrayList<String>();

	// 当前行
	private int curRow = 0;
	// 当前列
	private int curCol = 0;
	// 日期标志
	private boolean dateFlag;
	// 数字标志
	private boolean numberFlag;

	private boolean isTElement;

	/**
	 * 遍历工作簿中所有的电子表格
	 * @param filename
	 * @throws Exception
	 */
	public void process(String filename) throws Exception {
		OPCPackage pkg = OPCPackage.open(filename);
		XSSFReader r = new XSSFReader(pkg);
		SharedStringsTable sst = r.getSharedStringsTable();
		XMLReader parser = fetchSheetParser(sst);
		Iterator<InputStream> sheets = r.getSheetsData();
		while (sheets.hasNext()) {
			curRow = 0;
			sheetIndex++;
			InputStream sheet = sheets.next();
			InputSource sheetSource = new InputSource(sheet);
			parser.parse(sheetSource);
			sheet.close();
		}
	}

	/**
	 * 只遍历一个电子表格，其中sheetId为要遍历的sheet索引，从1开始，1-3
	 * @param filename
	 * @param sheetId
	 * @throws Exception
	 */
	public void process(String filename, int sheetId) throws Exception {
		OPCPackage pkg = OPCPackage.open(filename);
		XSSFReader r = new XSSFReader(pkg);
		SharedStringsTable sst = r.getSharedStringsTable();
		XMLReader parser = fetchSheetParser(sst);
		// 根据 rId# 或 rSheet# 查找sheet
		InputStream sheet2 = r.getSheet("rId" + sheetId);
		sheetIndex++;
		InputSource sheetSource = new InputSource(sheet2);
		parser.parse(sheetSource);
		sheet2.close();
	}

	public XMLReader fetchSheetParser(SharedStringsTable sst)
			throws SAXException {
		XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
		this.sst = sst;
		parser.setContentHandler(this);
		return parser;
	}

	public void startElement(String uri, String localName, String name,
			Attributes attributes) throws SAXException {

//		System.out.println("startElement: " + localName + ", " + name + ", " + attributes);

		// c => 单元格
		if ("c".equals(name)) {
			// 如果下一个元素是 SST 的索引，则将nextIsString标记为true
			String cellType = attributes.getValue("t");
			if ("s".equals(cellType)) {
				nextIsString = true;
			} else {
				nextIsString = false;
			}
			// 日期格式
			String cellDateType = attributes.getValue("s");
			if ("1".equals(cellDateType)) {
				dateFlag = true;
			} else {
				dateFlag = false;
			}
			String cellNumberType = attributes.getValue("s");
			if ("2".equals(cellNumberType)) {
				numberFlag = true;
			} else {
				numberFlag = false;
			}

		}
		// 当元素为t时
		if ("t".equals(name)) {
			isTElement = true;
		} else {
			isTElement = false;
		}

		// 置空
		lastContents = "";
	}

	public void endElement(String uri, String localName, String name)
			throws SAXException {

//		System.out.println("endElement: " + localName + ", " + name);

		// 根据SST的索引值的到单元格的真正要存储的字符串
		// 这时characters()方法可能会被调用多次
		if (nextIsString) {
			try {
				int idx = Integer.parseInt(lastContents);
				lastContents = new XSSFRichTextString(sst.getEntryAt(idx))
						.toString();
			} catch (Exception e) {

			}
		}
		// t元素也包含字符串
		if (isTElement) {
			String value = lastContents.trim();
			rowList.add(curCol, value);
			curCol++;
			isTElement = false;
			// v => 单元格的值，如果单元格是字符串则v标签的值为该字符串在SST中的索引
			// 将单元格内容加入rowlist中，在这之前先去掉字符串前后的空白符
		} else if ("v".equals(name)) {
			String value = lastContents.trim();
			value = value.equals("") ? " " : value;
			try {
				// 日期格式处理
				if (dateFlag) {
					Date date = HSSFDateUtil.getJavaDate(Double.valueOf(value));
					SimpleDateFormat dateFormat = new SimpleDateFormat("dd/MM/yyyy");
					value = dateFormat.format(date);
				}
				// 数字类型处理
				if (numberFlag) {
					BigDecimal bd = new BigDecimal(value);
					value = bd.setScale(3, BigDecimal.ROUND_UP).toString();
				}
			} catch (Exception e) {
				// 转换失败仍用读出来的值
			}
			rowList.add(curCol, value);
			curCol++;
		} else {
			// 如果标签名称为 row ，这说明已到行尾，调用 optRows() 方法
			if (name.equals("row")) {
				getRows(sheetIndex + 1, curRow, rowList);
				rowList.clear();
				curRow++;
				curCol = 0;
			}
		}

	}

	public void characters(char[] ch, int start, int length)
			throws SAXException {
		// 得到单元格内容的值
		lastContents += new String(ch, start, length);
	}

	/**
	 * 获取行数据回调
	 * @param sheetIndex
	 * @param curRow
	 * @param rowList
	 */
	public abstract void getRows(int sheetIndex, int curRow, List<String> rowList);

	/**
	 * 测试方法
	 */
	public static void main(String[] args) throws Exception {

String file = "E:/导入测试数据.xlsx";

ExcelReader reader = new ExcelReader() {
	public void getRows(int sheetIndex, int curRow, List<String> rowList) {
		
		System.out.println("Sheet:" + sheetIndex + ", Row:" + curRow + ", Data:" +rowList);
		
	}
};
reader.process(file, 1);

	}

}

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Writer;
import java.util.Calendar;
import java.util.Enumeration;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipOutputStream;

import org.apache.poi.hssf.util.CellReference;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

/**
 * Excel超大数据写入，抽象excel2007读入器，先构建.xlsx一张模板，改写模板中的sheet.xml,
 * 使用这种方法 写入.xlsx文件，不需要太大的内存
 * @version 2014-9-2
 */
public abstract class ExcelWriter {

	private SpreadsheetWriter sw;

	/**
	 * 写入电子表格的主要流程
	 * 
	 * @param fileName
	 * @throws Exception
	 */
	public void process(String fileName) throws Exception {
		
		// 建立工作簿和电子表格对象
		XSSFWorkbook wb = new XSSFWorkbook();
		XSSFSheet sheet = wb.createSheet("sheet1");
		
		// 持有电子表格数据的xml文件名 例如 /xl/worksheets/sheet1.xml
		String sheetRef = sheet.getPackagePart().getPartName().getName();

		// 保存模板
		FileOutputStream os = new FileOutputStream("template.xlsx");
		wb.write(os);
		os.close();

		// 生成xml文件
		File tmp = File.createTempFile("sheet", ".xml");
		Writer fw = new FileWriter(tmp);
		sw = new SpreadsheetWriter(fw);
		generate();
		fw.close();

		// 使用产生的数据替换模板
		File templateFile = new File("template.xlsx");
		FileOutputStream out = new FileOutputStream(fileName);
		substitute(templateFile, tmp, sheetRef.substring(1), out);
		out.close();
		// 删除文件之前调用一下垃圾回收器，否则无法删除模板文件
		System.gc();
		// 删除临时模板文件
		if (templateFile.isFile() && templateFile.exists()) {
			templateFile.delete();
		}
	}

	/**
	 * 类使用者应该使用此方法进行写操作
	 * 
	 * @throws Exception
	 */
	public abstract void generate() throws Exception;

	public void beginSheet() throws IOException {
		sw.beginSheet();
	}

	public void insertRow(int rowNum) throws IOException {
		sw.insertRow(rowNum);
	}

	public void createCell(int columnIndex, String value) throws IOException {
		sw.createCell(columnIndex, value, -1);
	}

	public void createCell(int columnIndex, double value) throws IOException {
		sw.createCell(columnIndex, value, -1);
	}

	public void endRow() throws IOException {
		sw.endRow();
	}

	public void endSheet() throws IOException {
		sw.endSheet();
	}

	/**
	 * 
	 * @param zipfile
	 *            the template file
	 * @param tmpfile
	 *            the XML file with the sheet data
	 * @param entry
	 *            the name of the sheet entry to substitute, e.g.
	 *            xl/worksheets/sheet1.xml
	 * @param out
	 *            the stream to write the result to
	 */
	private static void substitute(File zipfile, File tmpfile, String entry,
			OutputStream out) throws IOException {
		ZipFile zip = new ZipFile(zipfile);
		ZipOutputStream zos = new ZipOutputStream(out);

		@SuppressWarnings("unchecked")
		Enumeration<ZipEntry> en = (Enumeration<ZipEntry>) zip.entries();
		while (en.hasMoreElements()) {
			ZipEntry ze = en.nextElement();
			if (!ze.getName().equals(entry)) {
				zos.putNextEntry(new ZipEntry(ze.getName()));
				InputStream is = zip.getInputStream(ze);
				copyStream(is, zos);
				is.close();
			}
		}
		zos.putNextEntry(new ZipEntry(entry));
		InputStream is = new FileInputStream(tmpfile);
		copyStream(is, zos);
		is.close();
		zos.close();
	}

	private static void copyStream(InputStream in, OutputStream out)
			throws IOException {
		byte[] chunk = new byte[1024];
		int count;
		while ((count = in.read(chunk)) >= 0) {
			out.write(chunk, 0, count);
		}
	}

	/**
	 * 在写入器中写入电子表格
	 * 
	 */
	public static class SpreadsheetWriter {
		private final Writer _out;
		private int _rownum;
		private static String LINE_SEPARATOR = System
				.getProperty("line.separator");

		public SpreadsheetWriter(Writer out) {
			_out = out;
		}

		public void beginSheet() throws IOException {
			_out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
					+ "<worksheet xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\">");
			_out.write("<sheetData>" + LINE_SEPARATOR);
		}

		public void endSheet() throws IOException {
			_out.write("</sheetData>");
			_out.write("</worksheet>");
		}

		/**
		 * 插入新行
		 * 
		 * @param rownum
		 *            以0开始
		 */
		public void insertRow(int rownum) throws IOException {
			_out.write("<row r=\"" + (rownum + 1) + "\">" + LINE_SEPARATOR);
			this._rownum = rownum;
		}

		/**
		 * 插入行结束标志
		 */
		public void endRow() throws IOException {
			_out.write("</row>" + LINE_SEPARATOR);
		}

		/**
		 * 插入新列
		 * 
		 * @param columnIndex
		 * @param value
		 * @param styleIndex
		 * @throws IOException
		 */
		public void createCell(int columnIndex, String value, int styleIndex)
				throws IOException {
			String ref = new CellReference(_rownum, columnIndex)
					.formatAsString();
			_out.write("<c r=\"" + ref + "\" t=\"inlineStr\"");
			if (styleIndex != -1)
				_out.write(" s=\"" + styleIndex + "\"");
			_out.write(">");
			_out.write("<is><t>" + encoderXML(value) + "</t></is>");
			_out.write("</c>");
		}

		public void createCell(int columnIndex, String value)
				throws IOException {
			createCell(columnIndex, value, -1);
		}

		public void createCell(int columnIndex, double value, int styleIndex)
				throws IOException {
			String ref = new CellReference(_rownum, columnIndex)
					.formatAsString();
			_out.write("<c r=\"" + ref + "\" t=\"n\"");
			if (styleIndex != -1)
				_out.write(" s=\"" + styleIndex + "\"");
			_out.write(">");
			_out.write("<v>" + value + "</v>");
			_out.write("</c>");
		}

		public void createCell(int columnIndex, double value)
				throws IOException {
			createCell(columnIndex, value, -1);
		}

		public void createCell(int columnIndex, Calendar value, int styleIndex)
				throws IOException {
			createCell(columnIndex, DateUtil.getExcelDate(value, false),
					styleIndex);
		}
	}

	// XML Encode
	private static final String[] xmlCode = new String[256];

	static {
		// Special characters
		xmlCode['\''] = "'";
		xmlCode['\"'] = "\""; // double quote
		xmlCode['&'] = "&"; // ampersand
		xmlCode['<'] = "<"; // lower than
		xmlCode['>'] = ">"; // greater than
	}

	/**
	 * <p>
	 * Encode the given text into xml.
	 * </p>
	 * 
	 * @param string
	 *            the text to encode
	 * @return the encoded string
	 */
	public static String encoderXML(String string) {
		if (string == null)
			return "";
		int n = string.length();
		char character;
		String xmlchar;
		StringBuffer buffer = new StringBuffer();
		// loop over all the characters of the String.
		for (int i = 0; i < n; i++) {
			character = string.charAt(i);
			// the xmlcode of these characters are added to a StringBuffer
			// one by one
			try {
				xmlchar = xmlCode[character];
				if (xmlchar == null) {
					buffer.append(character);
				} else {
					buffer.append(xmlCode[character]);
				}
			} catch (ArrayIndexOutOfBoundsException aioobe) {
				buffer.append(character);
			}
		}
		return buffer.toString();
	}

	/**
	 * 测试方法
	 */
	public static void main(String[] args) throws Exception {

	String file = "E:/导出测试数据.xlsx";
	
	ExcelWriter writer = new ExcelWriter() {
		public void generate() throws Exception {
			
			// 电子表格开始
			this.beginSheet();
			
			for (int rownum = 0; rownum < 100; rownum++) {
				// 插入新行
				this.insertRow(rownum);
				
				// 建立新单元格,索引值从0开始,表示第一列
				this.createCell(0, "第 " + rownum + " 行");
				this.createCell(1, 34343.123456789);
				this.createCell(2, "23.67%");
				this.createCell(3, "12:12:23");
				this.createCell(4, "2014-10-11 12:12:23");
				this.createCell(5, "true");
				this.createCell(6, "false");

				// 结束行
				this.endRow();
			}
			
			// 电子表格结束
			this.endSheet();
		}
	};
	writer.process(file);
}
		
}

分享到：

spring中，在Java任何位置获取request对象 | 数据列表组件 jqGrid 二次封装

2014-11-01 13:34
浏览 29070
评论(3)
分类:开源软件
查看更多

3 楼 laoguan123 2017-06-08

比之前的版本有多大性能上的提升呢？有人做过比较测试吗？

2 楼 luo476979657 2017-03-16

空列，如何处理呢，空的时候插入数据库都错位了

1 楼 tzming1212 2016-11-11

我以前处理这种情况,近200M数据,还有多页,还要验证.
因为页的不同,分不同类,然后继承一个类,继承的那个类,内存太大,报内存溢出.
后来,每个页单独处理,不再继承一个共用类.
结果还是好慢.但可以运行.不会报内存溢出.

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论