`
derek_z
  • 浏览: 16759 次
  • 性别: Icon_minigender_1
  • 来自: 杭州
社区版块
存档分类
最新评论

大数据excel导入 sax

阅读更多
package com.h3c.oos.util;

/* ==================================================================== 
 Licensed to the Apache Software Foundation (ASF) under one or more 
 contributor license agreements.  See the NOTICE file distributed with 
 this work for additional information regarding copyright ownership. 
 The ASF licenses this file to You under the Apache License, Version 2.0 
 (the "License"); you may not use this file except in compliance with 
 the License.  You may obtain a copy of the License at 

 http://www.apache.org/licenses/LICENSE-2.0 

 Unless required by applicable law or agreed to in writing, software 
 distributed under the License is distributed on an "AS IS" BASIS, 
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 See the License for the specific language governing permissions and 
 limitations under the License. 
 ==================================================================== */

import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Reads an XLSX sheet in CSV (event/SAX) mode, which avoids the
 * out-of-memory errors that the user model is prone to.
 * <p>
 * This is the mode POI officially recommends for reading large amounts of
 * data. In user mode, a workbook with a lot of data, many sheets, or many
 * useless blank rows easily runs out of memory. Typical user-mode code looks
 * like this:
 * 
 * <pre>
 * FileInputStream file = new FileInputStream("c:\\test.xlsx");
 * Workbook wb = new XSSFWorkbook(file);
 * </pre>
 * 
 * @author 山人
 */
public class XLSXCovertCSVReader {

	/**
	 * The type of the data value is indicated by an attribute on the cell. The
	 * value is usually in a "v" element within the cell.
	 */
	enum xssfDataType {
		BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER,
	}

	/**
	 * SAX handler that turns the cells of one sheet into {@code String[]}
	 * rows. For the xssf_sax_API see
	 * http://poi.apache.org/spreadsheet/how-to.html#xssf_sax_api <p/> Also see
	 * Standard ECMA-376, 1st edition, part 4, pages 1928ff, at
	 * http://www.ecma-international.org/publications/standards/Ecma-376.htm
	 * <p/> A web-friendly version is http://openiso.org/Ecma/376/Part4
	 * <p/>
	 * A row is collected only when none of its stored cells is null/empty and
	 * its first two columns (or its only column) are filled; every other row
	 * is treated as blank and skipped.
	 */
	class MyXSSFSheetHandler extends DefaultHandler {

		/**
		 * Table with styles
		 */
		private final StylesTable stylesTable;

		/**
		 * Table with unique strings
		 */
		private final ReadOnlySharedStringsTable sharedStringsTable;

		/**
		 * Destination for diagnostic messages
		 */
		private final PrintStream output;

		/**
		 * Number of columns to read starting with leftmost
		 */
		private final int minColumnCount;

		// Set while a value element is open; characters() only collects then.
		private boolean vIsOpen;

		// Set when cell start element is seen;
		// used when cell close element is seen.
		private xssfDataType nextDataType;

		// Used to format numeric cell values.
		private short formatIndex;
		private String formatString;
		private final DataFormatter formatter;

		// One formatter per handler; a handler instance is driven by a single
		// parse() call, so the non-thread-safe SimpleDateFormat is not shared.
		private final SimpleDateFormat dateFormat = new SimpleDateFormat(
				"yyyy-MM-dd HH:mm:ss");

		// Zero-based column of the current cell, -1 before the first cell of
		// a row.
		private int thisColumn = -1;

		// Gathers characters as they are seen.
		private final StringBuilder value;
		// Cells of the row currently being read.
		private final String[] record;
		private List<String[]> rows = new ArrayList<String[]>();
		// True once a stored cell of the current row turned out null/empty.
		private boolean isCellNull = false;

		/**
		 * Accepts objects needed while parsing.
		 * 
		 * @param styles
		 *            Table of styles
		 * @param strings
		 *            Table of shared strings
		 * @param cols
		 *            Number of columns to read; when it is not positive no
		 *            row is collected
		 * @param target
		 *            Sink for diagnostic output
		 */
		public MyXSSFSheetHandler(StylesTable styles,
				ReadOnlySharedStringsTable strings, int cols, PrintStream target) {
			this.stylesTable = styles;
			this.sharedStringsTable = strings;
			this.minColumnCount = cols;
			this.output = target;
			this.value = new StringBuilder();
			this.nextDataType = xssfDataType.NUMBER;
			this.formatter = new DataFormatter();
			// A negative count must not blow up the array allocation.
			this.record = new String[Math.max(cols, 0)];
		}

		/*
		 * (non-Javadoc)
		 * 
		 * @see org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String,
		 *      java.lang.String, java.lang.String, org.xml.sax.Attributes)
		 */
		@Override
		public void startElement(String uri, String localName, String name,
				Attributes attributes) throws SAXException {

			if ("inlineStr".equals(name) || "v".equals(name)) {
				vIsOpen = true;
				// Clear contents cache
				value.setLength(0);
			} else if ("row".equals(name)) {
				// Cells without an explicit reference are positioned relative
				// to the previous cell of the same row.
				thisColumn = -1;
			}
			// c => cell
			else if ("c".equals(name)) {
				thisColumn = columnOf(attributes.getValue("r"));

				// Set up defaults.
				this.nextDataType = xssfDataType.NUMBER;
				this.formatIndex = -1;
				this.formatString = null;
				String cellType = attributes.getValue("t");
				String cellStyleStr = attributes.getValue("s");
				if ("b".equals(cellType)) {
					nextDataType = xssfDataType.BOOL;
				} else if ("e".equals(cellType)) {
					nextDataType = xssfDataType.ERROR;
				} else if ("inlineStr".equals(cellType)) {
					nextDataType = xssfDataType.INLINESTR;
				} else if ("s".equals(cellType)) {
					nextDataType = xssfDataType.SSTINDEX;
				} else if ("str".equals(cellType)) {
					nextDataType = xssfDataType.FORMULA;
				} else if (cellStyleStr != null) {
					// It's a number, but almost certainly one
					// with a special style or format
					int styleIndex = Integer.parseInt(cellStyleStr);
					XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
					if (style != null) {
						this.formatIndex = style.getDataFormat();
						this.formatString = style.getDataFormatString();
						if (this.formatString == null) {
							this.formatString = BuiltinFormats
									.getBuiltinFormat(this.formatIndex);
						}
					}
				}
			}

		}

		/*
		 * (non-Javadoc)
		 * 
		 * @see org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String,
		 *      java.lang.String, java.lang.String)
		 */
		@Override
		public void endElement(String uri, String localName, String name)
				throws SAXException {

			// v => contents of a cell
			if ("v".equals(name)) {
				vIsOpen = false;
				// Process the value contents now, as characters() may have
				// been called more than once.
				String thisStr = formatCellValue();

				// Cells beyond the requested column count are ignored instead
				// of overflowing the record.
				if (thisColumn >= 0 && thisColumn < record.length) {
					// Remember whether the cell value is null/empty
					if (thisStr == null || "".equals(thisStr)) {
						isCellNull = true;
					}
					record[thisColumn] = thisStr;
				}

			} else if ("row".equals(name)) {

				if (isCompleteRow()) {
					rows.add(record.clone());
				}
				// Always start the next row from a clean state, whether or
				// not this one was kept; otherwise a single rejected row
				// would leak its cells and its flag into all following rows.
				isCellNull = false;
				for (int i = 0; i < record.length; i++) {
					record[i] = null;
				}
				thisColumn = -1;
			}

		}

		/**
		 * Converts the text gathered for the current cell according to the
		 * data type announced on its {@code c} element.
		 * 
		 * @return the cell text; string-like values are wrapped in double
		 *         quotes; {@code null} when a shared-string index cannot be
		 *         parsed
		 */
		private String formatCellValue() {
			String thisStr = null;
			switch (nextDataType) {

			case BOOL:
				char first = value.charAt(0);
				thisStr = first == '0' ? "FALSE" : "TRUE";
				break;

			case ERROR:
				thisStr = "\"ERROR:" + value.toString() + '"';
				break;

			case FORMULA:
				// A formula could result in a string value,
				// so always add double-quote characters.
				thisStr = '"' + value.toString() + '"';
				break;

			case INLINESTR:
				// TODO: this path is untested.
				XSSFRichTextString rtsi = new XSSFRichTextString(value
						.toString());
				thisStr = '"' + rtsi.toString() + '"';
				break;

			case SSTINDEX:
				String sstIndex = value.toString();
				try {
					int idx = Integer.parseInt(sstIndex);
					XSSFRichTextString rtss = new XSSFRichTextString(
							sharedStringsTable.getEntryAt(idx));
					thisStr = '"' + rtss.toString() + '"';
				} catch (NumberFormatException ex) {
					output.println("Failed to parse SST index '" + sstIndex
							+ "': " + ex.toString());
				}
				break;

			case NUMBER:
				String n = value.toString();
				// Check whether the cell carries a date format. The second
				// argument is the format string, not the cell value.
				if (HSSFDateUtil.isADateFormat(this.formatIndex,
						this.formatString)) {
					double d = Double.parseDouble(n);
					Date date = HSSFDateUtil.getJavaDate(d);
					thisStr = formateDateToString(date);
				} else if (this.formatString != null) {
					thisStr = formatter.formatRawCellContents(Double
							.parseDouble(n), this.formatIndex,
							this.formatString);
				} else {
					thisStr = n;
				}
				break;

			default:
				thisStr = "(TODO: Unexpected type: " + nextDataType + ")";
				break;
			}
			return thisStr;
		}

		/**
		 * Decides whether the current record is a data row rather than a
		 * blank one.
		 * 
		 * @return {@code true} when columns are being collected, no stored
		 *         cell was null/empty, and the first column (and the second
		 *         one, when it exists) holds a value
		 */
		private boolean isCompleteRow() {
			if (minColumnCount <= 0 || record.length == 0 || isCellNull) {
				return false;
			}
			if (record[0] == null) {
				return false;
			}
			return record.length < 2 || record[1] != null;
		}

		/**
		 * @return the rows collected so far
		 */
		public List<String[]> getRows() {
			return rows;
		}

		/**
		 * @param rows
		 *            list that replaces the current row collection
		 */
		public void setRows(List<String[]> rows) {
			this.rows = rows;
		}

		/**
		 * Captures characters only if a suitable element is open. Originally
		 * was just "v"; extended for inlineStr also.
		 */
		@Override
		public void characters(char[] ch, int start, int length)
				throws SAXException {
			if (vIsOpen) {
				value.append(ch, start, length);
			}
		}

		/**
		 * Resolves the zero-based column of a cell from its reference.
		 * 
		 * @param cellRef
		 *            value of the cell's {@code r} attribute such as "C12",
		 *            or {@code null} when the attribute is absent
		 * @return the column index; for a missing reference the column right
		 *         after the previous cell of the row
		 */
		private int columnOf(String cellRef) {
			if (cellRef == null) {
				return thisColumn + 1;
			}
			// The column letters are everything before the first digit.
			int firstDigit = 0;
			while (firstDigit < cellRef.length()
					&& !Character.isDigit(cellRef.charAt(firstDigit))) {
				++firstDigit;
			}
			return nameToColumn(cellRef.substring(0, firstDigit));
		}

		/**
		 * Converts an Excel column name like "C" to a zero-based index.
		 * 
		 * @param name
		 *            upper-case column letters
		 * @return Index corresponding to the specified name, -1 for an empty
		 *         name
		 */
		private int nameToColumn(String name) {
			int column = -1;
			for (int i = 0; i < name.length(); ++i) {
				int c = name.charAt(i);
				column = (column + 1) * 26 + c - 'A';
			}
			return column;
		}

		/**
		 * Formats a date as {@code yyyy-MM-dd HH:mm:ss}.
		 */
		private String formateDateToString(Date date) {
			return dateFormat.format(date);
		}

	}

	// /////////////////////////////////////

	private final OPCPackage xlsxPackage;
	private final int minColumns;
	private final PrintStream output;
	private final String sheetName;

	/**
	 * Creates a new XLSX -> CSV converter
	 * 
	 * @param pkg
	 *            The XLSX package to process
	 * @param output
	 *            The PrintStream that receives diagnostic messages
	 * @param sheetName
	 *            Name of the sheet to read
	 * @param minColumns
	 *            Number of columns to read per row; rows are collected only
	 *            when this is positive
	 */
	public XLSXCovertCSVReader(OPCPackage pkg, PrintStream output,
			String sheetName, int minColumns) {
		this.xlsxPackage = pkg;
		this.output = output;
		this.minColumns = minColumns;
		this.sheetName = sheetName;
	}

	/**
	 * Parses the content of one sheet using the specified styles and
	 * shared-strings tables. The stream is not closed by this method.
	 * 
	 * @param styles
	 *            Table of styles
	 * @param strings
	 *            Table of shared strings
	 * @param sheetInputStream
	 *            XML of the sheet
	 * @return the rows collected from the sheet
	 */
	public List<String[]> processSheet(StylesTable styles,
			ReadOnlySharedStringsTable strings, InputStream sheetInputStream)
			throws IOException, ParserConfigurationException, SAXException {

		InputSource sheetSource = new InputSource(sheetInputStream);
		SAXParserFactory saxFactory = SAXParserFactory.newInstance();
		// Workbooks may come from untrusted users; enable the parser's
		// secure-processing mode before reading the sheet XML.
		saxFactory.setFeature(
				javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
		SAXParser saxParser = saxFactory.newSAXParser();
		XMLReader sheetParser = saxParser.getXMLReader();
		MyXSSFSheetHandler handler = new MyXSSFSheetHandler(styles, strings,
				this.minColumns, this.output);
		sheetParser.setContentHandler(handler);
		sheetParser.parse(sheetSource);
		return handler.getRows();
	}

	/**
	 * Runs the conversion: looks up the sheet whose name equals the
	 * configured sheet name and reads its rows.
	 * 
	 * @return the rows of the sheet, or {@code null} when the package
	 *         contains no sheet with that name
	 * @throws IOException
	 * @throws OpenXML4JException
	 * @throws ParserConfigurationException
	 * @throws SAXException
	 */
	public List<String[]> process() throws IOException, OpenXML4JException,
			ParserConfigurationException, SAXException {

		ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(
				this.xlsxPackage);
		XSSFReader xssfReader = new XSSFReader(this.xlsxPackage);
		StylesTable styles = xssfReader.getStylesTable();
		XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader
				.getSheetsData();
		while (iter.hasNext()) {
			InputStream stream = iter.next();
			try {
				if (this.sheetName.equals(iter.getSheetName())) {
					return processSheet(styles, strings, stream);
				}
			} finally {
				// Close every sheet stream, including skipped sheets and
				// the failure path.
				stream.close();
			}
		}
		return null;
	}

	/**
	 * Reads one sheet of an Excel file.
	 * 
	 * @param path
	 *            file path
	 * @param sheetName
	 *            sheet name
	 * @param minColumns
	 *            total number of columns
	 * @return the rows of the sheet, or {@code null} when the sheet does not
	 *         exist
	 * @throws SAXException
	 * @throws ParserConfigurationException
	 * @throws OpenXML4JException
	 * @throws IOException
	 */
	private static List<String[]> readerExcel(String path, String sheetName,
			int minColumns) throws IOException, OpenXML4JException,
			ParserConfigurationException, SAXException {
		OPCPackage p = OPCPackage.open(path, PackageAccess.READ);
		try {
			XLSXCovertCSVReader xlsx2csv = new XLSXCovertCSVReader(p,
					System.out, sheetName, minColumns);
			return xlsx2csv.process();
		} finally {
			// Release the file even when parsing fails.
			p.close();
		}
	}

	/**
	 * Demo entry point: prints every collected row of "Sheet1" in
	 * F:\test.xlsx.
	 */
	public static void main(String[] args) throws Exception {
		List<String[]> list = XLSXCovertCSVReader.readerExcel("F:\\test.xlsx", "Sheet1", 17);
		if (list == null) {
			System.err.println("Sheet not found: Sheet1");
			return;
		}
		for (String[] record : list) {
			for (String cell : record) {
				System.out.print(cell + "  ");
			}
			System.out.println();
		}
	}

}

 

分享到:
评论

相关推荐

    poi 自己写的excel sax方式导入大量数据

    以下是一些关于使用Apache POI的SAX方式导入大量Excel数据的关键知识点: 1. **SAX解析原理**: SAX解析器遵循事件驱动模式,当XML文件中的元素开始、结束、遇到文本等事件时,会触发相应的回调函数。对于Excel...

    使用Poi读取大数据量excel的方法

    在Java开发中,Apache POI库是一个非常实用的工具,用于读取...总之,Apache POI结合SAX API为在Android上处理大数据量Excel文件提供了有效方案,通过合理配置和优化,可以显著降低内存消耗,提高应用的稳定性和效率。

    Excel导入导出jar包下载

    在Java开发中,数据的导入导出是一项常见的需求,尤其在处理大量结构化数据时,Excel格式因其易读性、可编辑性和广泛接受度而备受青睐。为了在Java程序中实现Excel的导入导出功能,我们需要依赖一些第三方库,如...

    sax解析Excel

    标题中的“SAX解析Excel”指的是使用SAX(Simple API for XML)解析技术来处理Excel文件。SAX是一种事件驱动的XML解析器,它不像DOM(Document Object Model)那样将整个XML文档加载到内存中,而是逐行读取,因此在...

    excel导入导出bean和action

    此外,大文件的导入导出可能会导致内存溢出,因此需要优化处理策略,比如分批读写数据或者使用SAX解析器以减少内存占用。 6. **最佳实践**:为了提高代码的可维护性和复用性,通常会将Excel相关的操作封装成服务或...

    POI导入Excel并返回校验后的错误文件(原样)下载以及校验错误信息,同时加进度条

    POI导入Excel并返回校验后的错误文件(原样数据文件,并添加批注,注:由于批注只能加1000条,会在Excel后面添加一栏错误信息)下载以及页面展示校验错误信息,同时添加导入进度条,提供页面js和css代码,后端...

    超大xml解析导入数据库、千万级别大数据导出到Excel。实现核心:高性能、分段、分页循环:读取-写入-清空内存。解.zip

    标题提到的"超大XML解析导入数据库"和"千万级别大数据导出到Excel"涉及到两个关键的技术领域:大数据处理和高效数据转换。以下将详细探讨这两个方面的核心实现策略。 首先,对于超大XML文件的解析,XML是一种广泛...

    以读取XML方式的大批量导入Excel

    在IT行业中,数据导入与导出是常见的...总之,“以读取XML方式的大批量导入Excel”是一种高效的处理方式,尤其适用于大数据场景。通过XML解析和Java的工具库,我们可以构建出一个内存友好且性能优越的数据导入系统。

    excel导入导出

    - 为了性能考虑,读取大型Excel文件时,可以使用SAX API代替DOM API,避免一次性加载整个文件到内存。 - 使用完工作簿、工作表、行和单元格后,记得关闭以释放资源。 - 处理日期和公式时,要特别注意单元格类型,...

    SAX解析XML POI解析Excel实例

    <users> <user id="A001"> <name>zhaoyun</name> <age>40</age> </user> <user id="b001"> <name>Liubie</name> <age>25</age> </user> </users>

    java xml excel文件导入导出

    在Java开发中,处理Excel和XML文件是一项常见的任务,特别是在数据导入导出、报表生成以及数据交换等场景。本文将详细讲解如何使用Java进行Excel和XML文件的解析及操作。 一、Excel文件处理 1. **Apache POI库**:...

    EXCEL导入导出

    标题“EXCEL导入导出”涉及的是在编程中处理Excel文件的技术,这通常是指通过代码将数据导入到Excel表格中,以及从Excel表格导出数据。这种操作在数据分析、报表生成、自动化工作流程中非常常见。描述中的链接指向了...

    Java实现excel大数据量导入

    总结起来,Java中处理Excel大数据量导入的关键在于采用低内存消耗的策略,如SAX解析器,以及理解不同Excel版本的文件结构和限制。通过这些技术,可以高效地处理大量数据,避免对系统资源造成过大压力。

    Java解析XML文档(二):sax读取xml文件导出excel

    本文将深入讲解如何使用SAX(Simple API for XML)解析XML文档,并将解析结果导出到Excel文件中。SAX是一种事件驱动的解析方式,相比DOM(Document Object Model)解析,它更节省内存,适用于处理大型XML文件。 ...

    java web 导入 excel

    在Java Web开发中,导入Excel是一项常见的需求,例如用于数据导入、数据分析或者用户上传数据等场景。本知识点将深入探讨如何在Java Web环境下处理Excel文件,主要包括以下内容: 1. **环境准备**: - Java运行...

    SAX的jar包 SAX的jar包

    SAX的jar包 SAX的jar包SAX的jar包 SAX的jar包 SAX的jar包

    poi导入导出Excel表格的所以jar包

    标题提到的"poi导入导出Excel表格的所以jar包"指的是使用Apache POI库进行Excel操作所需的依赖库。 1. **poi-3.8-20120326.jar**: 这是Apache POI的主要核心库,包含了处理Excel文件的核心类和方法。例如,你可以...

    sax.jar sax.jar

    sax.jar sax.jar sax.jar sax.jar sax.jar sax.jar sax.jar

    POI导入Excel文件--form表单提交

    - 对于大量数据,考虑使用SAX API,它以流式处理Excel,内存消耗较小。 通过以上步骤,我们可以利用Apache POI有效地导入Excel文件,并结合form表单提交的数据进行操作。这个过程在数据导入、批量处理或者数据分析...

Global site tag (gtag.js) - Google Analytics