`
mengqingyu
  • 浏览: 333764 次
  • 性别: Icon_minigender_1
  • 来自: 天津
社区版块
存档分类
最新评论

HttpClient抓取解析网站支持多种验证方式

阅读更多
工作中需要抓取多个项目的数据,这些项目有多种验证方式,包括Http标准验证和非标准验证(NTLM、BASIC以及表单POST登录),于是写了个较通用的抓数框架,支持多线程,用到的开源框架有HttpClient 4.2.3、Jsoup、JSONObject、Spring3.0。注意:HttpClient不同版本之间API有差异。
设计思路:基于bean+spring配置文件方式,配置多个项目属性,实现项目自动登录,实现通用接口或抽象类,自定义解析类,最后通过url传参,反射实例化对象,实现方法的通用。

<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="
		http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd">

	<!-- Login settings for the target sites; multiple site login strategies can be configured -->
	<bean class="com.berheley.bi.grp.fetch.handler.HttpHandler" init-method="init">
		<property name="maxTotal" value="400"/>
		<property name="maxRoute" value="200"/>
		<property name="cnTimeOut" value="60000"/>
		<property name="soTimeOut" value="60000"/>
		<property name="attributes">
			<map>
				<entry key="60.28.43.164"> <!-- Domain name or IP address, and port number -->
					<bean class="com.berheley.bi.grp.fetch.pojo.HttpAttributes">
						<property name="packPath" value="com.berheley.bi.grp.fetch.custom.business"/>
						<property name="domain" value="60.28.43.164"/>
						<property name="port" value="80"/>   
						<property name="loginUrl" value=""/>	<!-- Full URL the login form is submitted to, e.g. http://www.iteye.com/login.jsp -->
						<property name="errorUrl" value=""/>	<!-- URL requested after a failed login, e.g. /error.jsp -->
						<property name="scheme" value="NTLM"/>
						<property name="params">
							<map>
								<entry key="username" value="登录名"/>
								<entry key="password" value="密码"/>
							</map>
						</property>
					</bean>
				</entry>
			</map>
		</property>
	</bean>
</beans>


import java.util.Map;

import org.apache.http.client.HttpClient;

/**
 * Per-site settings used for remote login and fetching: the target host, the
 * login/error URLs, the authentication scheme, the login parameters and the
 * {@link HttpClient} assigned to the site.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp  $
 * Create:  2014-2-26 01:49:02 PM
 */
public class HttpAttributes {

	/** Client used to send requests to this site. */
	private HttpClient httpClient;

	public HttpClient getHttpClient() {
		return this.httpClient;
	}

	public void setHttpClient(HttpClient httpClient) {
		this.httpClient = httpClient;
	}

	/** Package that holds the parser classes for pages of this site. */
	private String packPath;

	public String getPackPath() {
		return this.packPath;
	}

	public void setPackPath(String packPath) {
		this.packPath = packPath;
	}

	/** Domain name or IP address; defaults to the empty string. */
	private String domain = "";

	public String getDomain() {
		return this.domain;
	}

	public void setDomain(String domain) {
		this.domain = domain;
	}

	/** Port number; defaults to 80. */
	private int port = 80;

	public int getPort() {
		return this.port;
	}

	public void setPort(int port) {
		this.port = port;
	}

	/** Full URL the login form is submitted to, e.g. http://www.iteye.com/login.jsp. */
	private String loginUrl = "";

	public String getLoginUrl() {
		return this.loginUrl;
	}

	public void setLoginUrl(String loginUrl) {
		this.loginUrl = loginUrl;
	}

	/** URL requested after a failed login, e.g. /error.jsp. */
	private String errorUrl = "";

	public String getErrorUrl() {
		return this.errorUrl;
	}

	public void setErrorUrl(String errorUrl) {
		this.errorUrl = errorUrl;
	}

	/** Authentication scheme name. */
	private String scheme;

	public String getScheme() {
		return this.scheme;
	}

	public void setScheme(String scheme) {
		this.scheme = scheme;
	}

	/** Login parameters. */
	private Map<String,String> params;

	public Map<String, String> getParams() {
		return this.params;
	}

	public void setParams(Map<String, String> params) {
		this.params = params;
	}
}

import java.util.Map;

/**
 * 
 * 类功能描述:解析统一接口
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp @param <T> $
 * Create:  2014-2-26 下午01:53:10
 */
public interface IParse<T> {

	/**
	 * 
	 * @function:url中以m_开头的自定义参数
	 * @param params
	 * @return
	 * @author: mengqingyu    2014-3-4 上午09:32:54
	 */
	abstract T process(Map<String, Object> params);
}

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Base class for HTML page parsers. Subclasses implement
 * {@link IParse#process} against the parsed {@link #doc}; helpers shared by
 * several HTML parsers can be added here.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp  $
 * Create:  2014-2-19 01:53:53 PM
 * @param <T> type of the parse result
 */
public abstract class HtmlParse<T> implements IParse<T>{

	/** Page content parsed by Jsoup. */
	protected Document doc;

	/** Logger shared with subclasses. */
	protected Log log = LogFactory.getLog(HtmlParse.class);

	/**
	 * @param html raw HTML text, handed to {@code Jsoup.parse}
	 */
	public HtmlParse(String html) {
		Document parsed = Jsoup.parse(html);
		this.doc = parsed;
	}
}

import net.sf.json.JSONObject;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Base class for JSON response parsers. Subclasses implement
 * {@link IParse#process} against the parsed {@link #doc}; helpers shared by
 * several JSON parsers can be added here.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp  $
 * Create:  2014-2-19 01:53:53 PM
 * @param <T> type of the parse result
 */
public abstract class JsonParse<T> implements IParse<T>{

	/** Response content converted to a {@link JSONObject}. */
	protected JSONObject doc;

	/** Logger shared with subclasses. */
	protected Log log = LogFactory.getLog(JsonParse.class);

	/**
	 * @param json raw JSON text, handed to {@code JSONObject.fromObject}
	 */
	public JsonParse(String json) {
		JSONObject parsed = JSONObject.fromObject(json);
		this.doc = parsed;
	}
}

package com.berheley.bi.grp.fetch.parse;

import java.util.Map;

/**
 * Common contract implemented by every parser.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp @param <T> $
 * Create:  2014-2-26 01:53:10 PM
 * @param <T> type of the parse result
 */
public interface IParse<T> {

	/**
	 * Produces the parse result.
	 *
	 * @param params custom parameters, i.e. the URL parameters whose names start with {@code m_}
	 * @return the parse result
	 * @author mengqingyu    2014-3-4 09:32:54 AM
	 */
	T process(Map<String, Object> params);
}

import java.util.Map;

import net.sf.json.JSONObject;

import com.berheley.bi.grp.fetch.parse.HtmlParse;


public class FyxxInfoHtmlParse extends HtmlParse<String>{

	public FyxxInfoHtmlParse(String doc) {
		super(doc);
	}

	@Override
	public String process(Map<String, Object> params) {
		JSONObject jsonObj = new JSONObject();

		//价位无
		String tfj_rentcost = doc.getElementById("tfj_rentcost").val(); //租金 
		
		String tfj_buildingarea = doc.getElementById("tfj_buildingarea")==null?"":doc.getElementById("tfj_buildingarea").val();//面积
		
		String tfj_standardstorey = doc.getElementById("tfj_standardstorey").val();// 标准层高
		
		String tfj_floorloading = doc.getElementById("tfj_floorloading_d").val();//楼面承重 tfj_floorloading_d

		String tfj_phone = doc.getElementById("tfj_phone").val();//业主单位联系方式

		String tfj_propertycost = doc.getElementById("tfj_propertycost").val();//物业

		String tfj_watercost = doc.getElementById("tfj_watercost").val();//水

		String tfj_eleccost = doc.getElementById("tfj_eleccost").val();//电

		
		jsonObj.put("rentcost", tfj_rentcost);
		jsonObj.put("buildingarea", tfj_buildingarea);
		jsonObj.put("standardstorey", tfj_standardstorey);
		jsonObj.put("floorloading", tfj_floorloading);
		jsonObj.put("phone", tfj_phone);
		jsonObj.put("propertycost", tfj_propertycost);
		jsonObj.put("watercost", tfj_watercost);
		jsonObj.put("eleccost", tfj_eleccost);
		
		jsonObj.put("success", true);
		return jsonObj.toString();
	}
	
}

/**
 * Constants shared by the fetch framework.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp  $
 * Create:  2014-2-28 02:37:32 PM
 */
public final class HttpConstant {

	/** Name of the character set referred to as GBK. */
	public static final String GBK = "gbk";

	/** Name of the UTF-8 character set. */
	public static final String UTF8 = "UTF-8";

	/** Scheme name "POST". */
	public static final String POST = "POST";

	/** Parameter key "m_url". */
	public static final String URL = "m_url";

	/** Parameter key "m_parse". */
	public static final String PARSE = "m_parse";
}

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Constructor;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.entity.ContentType;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;

import com.berheley.bi.basic.exp.BusinessException;
import com.berheley.bi.grp.fetch.parse.IParse;
import com.berheley.bi.grp.fetch.pojo.HttpAttributes;

/**
 * Static helpers for sending HTTP requests, reading responses, preparing
 * request URLs and instantiating parser classes.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp  $
 * Create:  2014-2-19 01:53:18 PM
 */
public final class HttpUtils {
	
	private static Log log = LogFactory.getLog(HttpUtils.class);
	
	/**
	 * Sends a GET request.
	 *
	 * @param httpclient client used to execute the request
	 * @param url        request URL
	 * @return the response (never {@code null})
	 * @throws IllegalStateException if the request fails with an I/O or protocol error
	 * @author mengqingyu    2014-2-19
	 */
	public static HttpResponse httpGet(HttpClient httpclient, String url) {
		HttpResponse response = execute(httpclient, new HttpGet(url), null);
		log.info("get status: " + response.getStatusLine());
		return response;
	}
	
	/**
	 * Sends a GET request using an execution context.
	 *
	 * @param httpclient client used to execute the request
	 * @param url        request URL
	 * @param context    execution context, e.g. {@code new BasicHttpContext()}; after
	 *                   execution the request that was actually sent can be read from it
	 * @return the response (never {@code null})
	 * @throws IllegalStateException if the request fails with an I/O or protocol error
	 */
	public static HttpResponse httpGet(HttpClient httpclient, String url, HttpContext context) {
		HttpResponse response = execute(httpclient, new HttpGet(url), context);
		log.info("get status: " + response.getStatusLine());
		return response;
	}
	
	/**
	 * Sends a GET request and, for sites using the POST login scheme, logs in and
	 * repeats the GET when the first request ended up on the configured error URL.
	 *
	 * @param httpclient client used to execute the requests
	 * @param url        request URL
	 * @param context    execution context; used to read the request that was finally sent
	 * @param attributes settings of the target site (scheme, error URL, login URL, login parameters)
	 * @return the response of the last GET request
	 * @throws IllegalStateException if a request fails with an I/O or protocol error
	 */
	public static HttpResponse httpGetByScheme(HttpClient httpclient, String url, HttpContext context, HttpAttributes attributes) {
		HttpResponse response = httpGet(httpclient, url, context);
		HttpUriRequest req = (HttpUriRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST);
		log.info("get请求跳转地址: " + req.getURI());
		if(HttpConstant.POST.equalsIgnoreCase(attributes.getScheme())&&attributes.getErrorUrl().equalsIgnoreCase(req.getURI().toString())){
			// The error-page response and the login response are both discarded; their
			// entities must be consumed so the connections are released.
			release(response);
			release(httpPost(httpclient, attributes.getLoginUrl(), getPairs(attributes.getParams())));
			response = httpGet(httpclient, url, context);
		}
		log.info("get status: " + response.getStatusLine());
		return response;
	}
	
	/**
	 * Submits a form with a POST request; the form is encoded with the
	 * {@link HttpConstant#GBK} character set.
	 *
	 * @param httpclient client used to execute the request
	 * @param url        request URL
	 * @param params     form fields
	 * @return the response (never {@code null})
	 * @throws IllegalStateException if the request fails with an I/O or protocol error
	 * @author mengqingyu    2014-2-19
	 */
	public static HttpResponse httpPost(HttpClient httpclient, String url, List<NameValuePair> params) {
		HttpPost httpost = new HttpPost(url);
		httpost.setEntity(new UrlEncodedFormEntity(params, Charset.forName(HttpConstant.GBK)));
		HttpResponse response = execute(httpclient, httpost, null);
		log.info("post status: " + response.getStatusLine());
		return response;
	}
	
	/**
	 * Returns the target host recorded in the execution context.
	 *
	 * @param context execution context of a finished request
	 * @return the value stored under {@code ExecutionContext.HTTP_TARGET_HOST}
	 * @author mengqingyu    2014-2-19
	 */
	public static HttpHost getHttpHost(HttpContext context) {
		return (HttpHost) context.getAttribute(ExecutionContext.HTTP_TARGET_HOST);
	}
	
	/**
	 * Returns the request recorded in the execution context.
	 *
	 * @param context execution context of a finished request
	 * @return the value stored under {@code ExecutionContext.HTTP_REQUEST}
	 * @author mengqingyu    2014-2-19
	 */
	public static HttpUriRequest getHttpUriRequest(HttpContext context) {
		return (HttpUriRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST);
	}
	
	/**
	 * Converts a map into a list of form fields.
	 *
	 * @param params field names mapped to values; may be {@code null}
	 * @return one pair per map entry, empty when {@code params} is {@code null};
	 *         a {@code null} map value yields a pair with a {@code null} value
	 * @author mengqingyu    2014-2-19
	 */
	public static List<NameValuePair> getPairs(Map<?, ?> params) {
		List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>();
		if (params != null) {
			for (Map.Entry<?, ?> entry : params.entrySet()) {
				Object value = entry.getValue();
				nameValuePairs.add(new BasicNameValuePair(entry.getKey().toString(), value == null ? null : value.toString()));
			}
		}
		return nameValuePairs;
	}
	
	/**
	 * Reads the response body as text and releases the connection.
	 * <p>
	 * The character set declared by the entity is used, falling back to
	 * {@link HttpConstant#GBK}. The body is returned unchanged, including line
	 * terminators: joining lines without a separator can fuse tokens that were
	 * separated only by a line break.
	 *
	 * @param response response whose body is read
	 * @return the body text, or {@code ""} when the response has no entity
	 * @throws IllegalStateException if reading the body fails
	 * @author mengqingyu    2014-2-19
	 */
	public static String entityToString(HttpResponse response) {
		HttpEntity entity = response.getEntity();
		if (entity == null) {
			return "";
		}
		try {
			return EntityUtils.toString(entity, Charset.forName(HttpConstant.GBK));
		} catch (IOException e) {
			throw new IllegalStateException("Failed to read response body", e);
		} finally {
			release(response);
		}
	}
	
	/**
	 * Instantiates a parser class by reflection, using its single-{@code String} constructor.
	 *
	 * @param packPath  package of the parser class
	 * @param parseBean simple name of the parser class
	 * @param text      content handed to the parser's constructor
	 * @return the new parser
	 * @throws BusinessException if the class cannot be found or instantiated
	 * @author mengqingyu    2014-2-26
	 */
	@SuppressWarnings("unchecked")
	public static IParse<String> newInstance(String packPath, String parseBean, String text) throws BusinessException{
		try {
			Class<?> clazz = Class.forName(packPath+"."+parseBean);
			Constructor<?> constructor = clazz.getConstructor(String.class);
			// Unchecked: the result type parameter of the parser cannot be verified at run time.
			return (IParse<String>) constructor.newInstance(text);
		} catch (Exception e) {
			throw new BusinessException("网页解析类初始化错误 "+e.getMessage(), e);
		}
	}
	
	/**
	 * Splits the URL stored under {@link HttpConstant#URL} into the request to send
	 * and the custom parameters.
	 * <p>
	 * Every query parameter whose name starts with {@code m_} is removed from the
	 * URL, wherever it appears in the query string, and stored in {@code params}
	 * (name mapped to value, {@code ""} when there is no value). The remaining query
	 * string is URL-encoded as UTF-8 with {@code =} and {@code &} left intact. When
	 * no parameter remains, the URL is returned without a trailing {@code ?}.
	 *
	 * @param params request parameters; must contain {@link HttpConstant#URL}; receives the extracted {@code m_} parameters
	 * @return the URL to request
	 * @throws IllegalArgumentException if {@link HttpConstant#URL} is missing
	 * @author mengqingyu 2014-2-26
	 */
	public static String initParams(Map<String, Object> params) {
		Object rawUrl = params.get(HttpConstant.URL);
		if (rawUrl == null) {
			throw new IllegalArgumentException("Missing required parameter " + HttpConstant.URL);
		}
		String url = rawUrl.toString();
		int queryStart = url.indexOf('?');
		if (queryStart == -1) {
			return url;
		}
		StringBuilder forwarded = new StringBuilder();
		for (String pair : url.substring(queryStart + 1).split("&")) {
			if (pair.startsWith("m_")) {
				int eq = pair.indexOf('=');
				String name = eq == -1 ? pair : pair.substring(0, eq);
				String value = eq == -1 ? "" : pair.substring(eq + 1);
				params.put(name, value);
				continue;
			}
			if (forwarded.length() > 0) {
				forwarded.append('&');
			}
			forwarded.append(pair);
		}
		String path = url.substring(0, queryStart);
		if (forwarded.length() == 0) {
			return path;
		}
		// The whole query string is encoded at once, so the separators are restored afterwards.
		String encoded = urlEncoder(forwarded.toString()).replace("%3D", "=").replace("%26", "&");
		return path + "?" + encoded;
	}
	
	/**
	 * URL-encodes a string as UTF-8.
	 *
	 * @param paramStr text to encode
	 * @return the encoded text, or the input unchanged if the encoding is not supported
	 * @author mengqingyu    2014-2-28
	 */
	public static String urlEncoder(String paramStr) {
		try {
			paramStr = URLEncoder.encode(paramStr,HttpConstant.UTF8);
		} catch (UnsupportedEncodingException e) {
			log.error("url编码错误", e);
		}
		return paramStr;
	}

	/**
	 * Executes a request, converting I/O and protocol failures into an unchecked exception.
	 *
	 * @param context execution context, or {@code null} to execute without one
	 */
	private static HttpResponse execute(HttpClient httpclient, HttpUriRequest request, HttpContext context) {
		try {
			return context == null ? httpclient.execute(request) : httpclient.execute(request, context);
		} catch (IOException e) {
			// ClientProtocolException is an IOException, so both original catch clauses are covered.
			request.abort();
			throw new IllegalStateException(request.getMethod() + " request failed: " + request.getURI(), e);
		}
	}

	/** Consumes the response entity so that the underlying connection is released. */
	private static void release(HttpResponse response) {
		try {
			EntityUtils.consume(response.getEntity());
		} catch (IOException e) {
			log.warn("Failed to release response entity", e);
		}
	}
}

import java.io.IOException;

import jcifs.ntlmssp.NtlmFlags;
import jcifs.ntlmssp.Type1Message;
import jcifs.ntlmssp.Type2Message;
import jcifs.ntlmssp.Type3Message;
import jcifs.util.Base64;

import org.apache.http.impl.auth.NTLMEngine;
import org.apache.http.impl.auth.NTLMEngineException;

/**
 * 
 * 类功能描述:JCIFS实现NTLM windows域验证
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp  $
 * Create:  2014-2-26 下午01:55:31
 */
public final class JCIFSEngine implements NTLMEngine {

	private static final int TYPE_1_FLAGS = NtlmFlags.NTLMSSP_NEGOTIATE_56 | NtlmFlags.NTLMSSP_NEGOTIATE_128 | NtlmFlags.NTLMSSP_NEGOTIATE_NTLM2
			| NtlmFlags.NTLMSSP_NEGOTIATE_ALWAYS_SIGN | NtlmFlags.NTLMSSP_REQUEST_TARGET;

	public String generateType1Msg(final String domain, final String workstation) throws NTLMEngineException {
		final Type1Message type1Message = new Type1Message(TYPE_1_FLAGS, domain, workstation);
		return Base64.encode(type1Message.toByteArray());
	}

	public String generateType3Msg(final String username, final String password, final String domain, final String workstation, final String challenge)
			throws NTLMEngineException {
		Type2Message type2Message;
		try {
			type2Message = new Type2Message(Base64.decode(challenge));
		} catch (final IOException exception) {
			throw new NTLMEngineException("Invalid NTLM type 2 message", exception);
		}
		final int type2Flags = type2Message.getFlags();
		final int type3Flags = type2Flags & (0xffffffff ^ (NtlmFlags.NTLMSSP_TARGET_TYPE_DOMAIN | NtlmFlags.NTLMSSP_TARGET_TYPE_SERVER));
		final Type3Message type3Message = new Type3Message(type2Message, password, domain, username, workstation, type3Flags);
		return Base64.encode(type3Message.toByteArray());
	}

}

import org.apache.http.auth.AuthScheme;
import org.apache.http.auth.AuthSchemeFactory;
import org.apache.http.impl.auth.NTLMScheme;
import org.apache.http.params.HttpParams;

/**
 * {@link AuthSchemeFactory} that creates NTLM schemes backed by
 * {@link JCIFSEngine} (NTLM Windows domain authentication).
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp  $
 * Create:  2014-2-26 01:54:40 PM
 */
public class NTLMSchemeFactory implements AuthSchemeFactory {

    /**
     * Creates a new scheme with its own engine instance.
     *
     * @param params not used
     * @return a new {@link NTLMScheme}
     */
    public AuthScheme newInstance(final HttpParams params) {
        final JCIFSEngine engine = new JCIFSEngine();
        return new NTLMScheme(engine);
    }

}

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.NTCredentials;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.auth.params.AuthPNames;
import org.apache.http.client.HttpClient;
import org.apache.http.client.params.AuthPolicy;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.HttpParams;

import com.berheley.bi.grp.fetch.ntlm.NTLMSchemeFactory;
import com.berheley.bi.grp.fetch.pojo.HttpAttributes;
import com.berheley.bi.grp.fetch.util.HttpConstant;

/**
 * Creates and holds the {@link HttpClient} instances used for remote login:
 * one client per configured site, chosen by the site's authentication scheme.
 * 
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $
 *          Create: 2014-2-26 01:49:45 PM
 */
public class HttpHandler {

	private Log log = LogFactory.getLog(HttpHandler.class);
	
	// Upper limit of connections in the pool
	private int maxTotal = 400;

	// Maximum number of concurrent connections per route (target server)
	private int maxRoute = 200;

	// Connection timeout
	private int cnTimeOut = 60000;

	// Socket (data transfer) timeout
	private int soTimeOut = 60000;

	// Shared client, used for sites with the POST login scheme
	private HttpClient httpClient;

	// Connection parameters
	private HttpParams httpParams;

	// Connection manager shared by all clients
	private ClientConnectionManager connectionManager;

	// key: host part of the URL, value: settings of that site
	private Map<String, HttpAttributes> attributes;

	/**
	 * Builds the parameters, connection manager and shared client from the field
	 * defaults. Setters run after the constructor, so {@link #init()} re-applies
	 * the configured values.
	 */
	public HttpHandler() {
		httpParams = this.getHp();
		connectionManager = this.getCm();
		httpClient = new DefaultHttpClient(connectionManager, httpParams);
	}

	public int getMaxTotal() {
		return maxTotal;
	}

	public void setMaxTotal(int maxTotal) {
		this.maxTotal = maxTotal;
	}

	public int getMaxRoute() {
		return maxRoute;
	}

	public void setMaxRoute(int maxRoute) {
		this.maxRoute = maxRoute;
	}

	public int getCnTimeOut() {
		return cnTimeOut;
	}

	public void setCnTimeOut(int cnTimeOut) {
		this.cnTimeOut = cnTimeOut;
	}

	public int getSoTimeOut() {
		return soTimeOut;
	}

	public void setSoTimeOut(int soTimeOut) {
		this.soTimeOut = soTimeOut;
	}

	public HttpParams getHttpParams() {
		return httpParams;
	}

	public void setHttpParams(HttpParams httpParams) {
		this.httpParams = httpParams;
	}

	public ClientConnectionManager getConnectionManager() {
		return connectionManager;
	}

	public void setConnectionManager(ClientConnectionManager connectionManager) {
		this.connectionManager = connectionManager;
	}

	public Map<String, HttpAttributes> getAttributes() {
		return attributes;
	}

	public void setAttributes(Map<String, HttpAttributes> attributes) {
		this.attributes = attributes;
	}

	/**
	 * Initialises the clients; intended to run after all properties have been set.
	 * <p>
	 * Applies the configured timeouts and pool limits, rebuilds the shared client
	 * and assigns a client to every configured site: a dedicated client with
	 * registered credentials for NTLM and BASIC, the shared client for POST.
	 * A site with any other scheme gets no client and a warning is logged.
	 *
	 * @throws IllegalStateException if no site attributes have been configured
	 * @author mengqingyu 2014-2-26
	 */
	public void init() {
		if (attributes == null) {
			throw new IllegalStateException("HttpHandler requires the 'attributes' property to be configured");
		}
		applyConfiguredSettings();
		// Rebuilt so that params / connection manager replaced through the setters take effect.
		httpClient = new DefaultHttpClient(connectionManager, httpParams);
		for (Entry<String, HttpAttributes> entry : attributes.entrySet()) {
			HttpAttributes site = entry.getValue();
			String scheme = site.getScheme();
			if (AuthPolicy.NTLM.equalsIgnoreCase(scheme)) {
				site.setHttpClient(newNtlmClient(site));
			} else if (AuthPolicy.BASIC.equalsIgnoreCase(scheme)) {
				site.setHttpClient(newBasicClient(site));
			} else if (HttpConstant.POST.equalsIgnoreCase(scheme)) {
				site.setHttpClient(this.httpClient);
			} else {
				log.warn("Unsupported scheme '" + scheme + "' for site " + entry.getKey() + "; no HttpClient assigned");
			}
		}
		log.info("初始化 HttpClient");
	}

	/** Copies the configured timeouts and pool limits onto the live params and connection manager. */
	private void applyConfiguredSettings() {
		httpParams.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, cnTimeOut);
		httpParams.setParameter(CoreConnectionPNames.SO_TIMEOUT, soTimeOut);
		if (connectionManager instanceof PoolingClientConnectionManager) {
			PoolingClientConnectionManager pool = (PoolingClientConnectionManager) connectionManager;
			pool.setMaxTotal(maxTotal);
			pool.setDefaultMaxPerRoute(maxRoute);
		}
	}

	/** Creates a client that prefers NTLM and answers with the site's username/password for any auth scope. */
	private DefaultHttpClient newNtlmClient(HttpAttributes site) {
		DefaultHttpClient client = new DefaultHttpClient(connectionManager, httpParams);
		List<String> authpref = new ArrayList<String>();
		authpref.add(AuthPolicy.NTLM);
		client.getParams().setParameter(AuthPNames.TARGET_AUTH_PREF, authpref);
		client.getAuthSchemes().register(AuthPolicy.NTLM, new NTLMSchemeFactory());
		NTCredentials creds = new NTCredentials(site.getParams().get("username"), site.getParams().get("password"), "", "");
		client.getCredentialsProvider().setCredentials(AuthScope.ANY, creds);
		return client;
	}

	/** Creates a client with username/password credentials scoped to the site's domain and port. */
	private DefaultHttpClient newBasicClient(HttpAttributes site) {
		DefaultHttpClient client = new DefaultHttpClient(connectionManager, httpParams);
		client.getCredentialsProvider().setCredentials(new AuthScope(site.getDomain(), site.getPort()),
				new UsernamePasswordCredentials(site.getParams().get("username"), site.getParams().get("password")));
		return client;
	}

	/**
	 * Builds the connection parameters (connection and socket timeouts).
	 *
	 * @return new parameters holding the current timeout values
	 * @author mengqingyu 2014-2-26
	 */
	private HttpParams getHp() {
		HttpParams params = new BasicHttpParams();
		params.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, cnTimeOut);
		params.setParameter(CoreConnectionPNames.SO_TIMEOUT, soTimeOut);
		return params;
	}

	/**
	 * Builds the pooling connection manager with the http and https schemes registered.
	 *
	 * @return new connection manager holding the current pool limits
	 * @author mengqingyu 2014-2-26
	 */
	private ClientConnectionManager getCm() {
		SchemeRegistry schemeRegistry = new SchemeRegistry();
		schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
		// https uses port 443 and needs an SSL socket factory; fully qualified so no extra import is required.
		schemeRegistry.register(new Scheme("https", 443, org.apache.http.conn.ssl.SSLSocketFactory.getSocketFactory()));
		PoolingClientConnectionManager cm = new PoolingClientConnectionManager(schemeRegistry);
		cm.setMaxTotal(maxTotal);
		cm.setDefaultMaxPerRoute(maxRoute);
		return cm;
	}

	/**
	 * Looks up the site settings for a URL.
	 * <p>
	 * The lookup key is the text between {@code ://} (if present) and the first
	 * {@code /}, {@code ?} or {@code #}, so it includes the port when the URL
	 * states one.
	 *
	 * @param url request URL
	 * @return the settings registered under that key, or {@code null} if there are none
	 * @author mengqingyu 2014-2-27
	 */
	public HttpAttributes getHttpAttributes(String url) {
		if (attributes == null) {
			return null;
		}
		int schemeEnd = url.indexOf("://");
		String host = schemeEnd == -1 ? url : url.substring(schemeEnd + 3);
		int end = host.length();
		for (int i = 0; i < host.length(); i++) {
			char c = host.charAt(i);
			if (c == '/' || c == '?' || c == '#') {
				end = i;
				break;
			}
		}
		return attributes.get(host.substring(0, end));
	}
}

import java.util.Map;

import com.berheley.bi.basic.exp.BusinessException;

/**
 * Service contract for fetching a remote page and parsing it.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp  $
 * Create:  2014-2-26 01:56:24 PM
 */
public interface IFetchService {

	/**
	 * Fetches the data and parses it.
	 *
	 * @param params request parameters. The key {@code m_url} is required: it holds the
	 *               full request URL including its query parameters, and that URL has to
	 *               carry the {@code m_parse} parameter.
	 * @return the parse result
	 * @throws BusinessException
	 * @author mengqingyu    2014-2-26 01:56:38 PM
	 */
	String findDate(Map<String,Object> params) throws BusinessException;
}

import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpResponse;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import com.berheley.bi.basic.exp.BusinessException;
import com.berheley.bi.grp.fetch.handler.HttpHandler;
import com.berheley.bi.grp.fetch.parse.IParse;
import com.berheley.bi.grp.fetch.pojo.HttpAttributes;
import com.berheley.bi.grp.fetch.util.HttpConstant;
import com.berheley.bi.grp.fetch.util.HttpUtils;

/**
 * Default {@link IFetchService}: fetches a page through the client configured
 * for its host and hands the content to a parser chosen by request parameter.
 * 
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $
 *          Create: 2014-2-26 01:56:57 PM
 */
@Service
public class FetchService implements IFetchService {

	private Log log = LogFactory.getLog(FetchService.class);

	@Autowired
	private HttpHandler httpHandler;

	/**
	 * Fetches the URL given in {@code params} and returns the parser's output.
	 * <p>
	 * Steps: {@link HttpUtils#initParams} prepares the URL, the site settings are
	 * looked up by URL, the page is requested with {@link HttpUtils#httpGetByScheme},
	 * and the body is passed to the parser class named by {@link HttpConstant#PARSE}
	 * inside the site's parser package. All configuration problems are reported
	 * before any request is sent.
	 *
	 * @param params request parameters
	 * @return the string produced by the parser
	 * @throws BusinessException if no site settings or client exist for the URL, if the
	 *         {@link HttpConstant#PARSE} parameter is missing, or if the parser cannot be created
	 */
	@Override
	public String findDate(Map<String, Object> params) throws BusinessException {
		String url = HttpUtils.initParams(params);
		HttpAttributes attributes = httpHandler.getHttpAttributes(url);
		if (attributes == null) {
			throw configurationError("No site configuration found for url: " + url);
		}
		// The client is passed on as the HttpClient interface; no downcast is needed.
		if (attributes.getHttpClient() == null) {
			throw configurationError("No HttpClient initialised for url: " + url);
		}
		Object parseBean = params.get(HttpConstant.PARSE);
		if (parseBean == null) {
			throw configurationError("Missing required parameter " + HttpConstant.PARSE + " for url: " + url);
		}
		HttpContext localContext = new BasicHttpContext();
		HttpResponse response = HttpUtils.httpGetByScheme(attributes.getHttpClient(), url, localContext, attributes);
		String result = HttpUtils.entityToString(response);
		IParse<String> parse = HttpUtils.newInstance(attributes.getPackPath(), parseBean.toString(), result);
		String json = parse.process(params);
		log.info(json);
		return json;
	}

	/** Builds a BusinessException through the (message, cause) constructor used elsewhere in the project. */
	private static BusinessException configurationError(String message) {
		return new BusinessException(message, new IllegalArgumentException(message));
	}

}
分享到:
评论

相关推荐

    java httpclient 抓取 数据 和jar 包

    HttpClient库不仅支持基本的HTTP协议,还支持HTTPS以及一些高级特性,如Cookie管理、身份验证等。 **二、配置HttpClient** 首先,确保你的项目已经引入了Apache HttpClient库。如果你使用Maven,可以在pom.xml文件...

    httpClient采集jsoup解析

    例如,如果你想要抓取一个新闻网站的最新文章标题,可以先使用HttpClient发送请求获取网页源码,再用Jsoup解析HTML,找到包含文章标题的元素: ```java CloseableHttpClient httpClient = HttpClients.create...

    httpClient+jsoup抓取网页数据实例和jar包

    HttpClient是Apache基金会开发的一个开放源代码库,它允许开发者发送HTTP请求并接收响应,支持多种HTTP协议版本。HttpClient提供了丰富的功能,包括连接管理、重试策略、身份验证等,使得网络通信变得更加方便和可靠...

    HttpClient网页抓取工具包整合

    - HttpClient常用于网页抓取,配合HTML解析库(如Jsoup)可以提取网页数据。 - 在自动化测试中,模拟用户请求以验证服务器端功能。 在实际使用中,HttpClient的灵活性和强大功能使其成为Java开发者的首选HTTP...

    HttpClient 登录163邮箱

    HttpClient库提供了对HTTP协议的全面支持,包括各种HTTP方法(GET、POST、PUT等)、重定向处理、身份验证、cookie管理等。它的设计目标是为开发者提供一个强大、灵活且易于使用的HTTP客户端API。 2. **登录流程** ...

    httpclient抓取网页数据和所需的10个jar包

    使用HttpClient抓取网页数据的基本步骤如下: 1. **创建HttpClient对象**:首先,你需要创建一个HttpClient实例,这通常是通过`HttpClientBuilder`或`HttpClients`类完成的。 2. **构建HttpGet请求**:对于简单的...

    httpclient 静态化网站 project

    【标题】"httpclient 静态化网站 project"是一个基于Apache HttpClient库的项目,旨在实现网站内容的抓取和静态化。HttpClient是一个强大的HTTP客户端编程工具包,它允许开发者在Java环境中执行HTTP请求,获取网页...

    httpclient远程网页抓取工具

    HTTPClient是Apache软件基金会的 HttpClient项目提供的一款Java库,它为Java程序员提供了强大的HTTP客户端功能,使得能够方便地进行网络请求和网页抓取。在本文中,我们将深入探讨HTTPClient库的基本概念、主要功能...

    httpclient

    3. **身份验证**:支持多种认证机制,如Basic Auth、Digest Auth,便于处理需要身份验证的网站。 4. **重试机制**:当请求失败时,HttpClient可以自动重试,提高了爬虫的稳定性。 5. **HTTP/2支持**:HttpClient ...

    httpclient4.5.5所有包

    - **安全性**:支持 SSL/TLS 安全连接,以及多种身份验证机制,如 Basic、Digest、NTLM 和 Kerberos。 4. **使用示例** 创建一个 HttpClient 实例,设置请求参数,执行请求并处理响应: ```java ...

    httpclient4.3中文教程

    此外,HttpClient 还支持设置请求头、身份验证、连接管理、超时控制等高级功能,以满足各种复杂的网络通信需求。 HttpClient 不是浏览器,它不会解析 HTML 内容或执行 JavaScript。它是一个低级别的库,主要用于...

    httpclient jar

    6. **身份验证**:支持多种认证机制,如Basic、Digest、NTLM等。 7. **自定义性**:HttpClient提供了丰富的API,开发者可以根据需求进行高度定制。 五、HttpClient的应用场景 HttpClient广泛应用于各种需要HTTP...

    httpclient-4.5.6.rar

    2. **认证与安全**:此版本加强了身份验证和安全特性,支持多种认证机制(如 Basic、Digest、NTLM 和 Kerberos),并提供了 TLS/SSL 支持,确保数据传输的安全性。 3. **缓存机制**:HttpClient 4.5.6 提供了 HTTP ...

    httpclient-4.4

    5. **身份认证与安全**:HttpClient 4.4支持多种身份验证机制,包括基本认证、摘要认证、NTLM和Kerberos等。此外,它还集成了SSL/TLS,可以进行加密通信,保证了数据传输的安全性。 6. **异步操作**:HttpClient ...

    HttpClient4.1.2中英文文档

    - **初始化HttpClient**:了解如何创建HttpClient实例,设置基本配置,如默认主机名验证、超时设置等。 - **执行HTTP请求**:学习如何构造HttpGet、HttpPost等请求对象,并添加请求头和请求体。同时,理解如何使用...

    org.apache.commons.httpclient-3.1.jar

    6. **身份验证**:支持多种身份验证机制,如基本、摘要、NTLM和Kerberos等。 除了核心的HTTP请求处理功能,HttpClient 3.1还包含了一些辅助类和工具,如Cookie管理、URL编码和解码、HTTP状态管理和响应解析等。这些...

    HttpClient4.2.5

    5. **Authentication**:HttpClient支持多种认证机制,包括基本认证、摘要认证和NTLM认证,可处理服务器和代理的身份验证需求。 6. **Cookie Management**:通过`CookieStore`和`CookiePolicy`,HttpClient可以处理...

    httpclient-4.5所需jar包.rar

    4. **认证和安全**:HTTPClient支持多种身份验证机制,包括基本认证、摘要认证以及NTLM等。此外,它还支持HTTPS协议,可以处理SSL/TLS加密,确保数据传输的安全性。 5. **重试策略**:当遇到网络问题或服务器错误时...

    httpclient-4.5 jar包

    《HttpClient 4.5:构建高效HTTP客户端的基石》 HttpClient是Apache软件基金会的一个开源项目,主要用于提供HTTP协议的客户端...无论是简单的网页抓取,还是复杂的Web服务交互,HttpClient 4.5都能提供可靠的支持。

    httpClient

    4. **身份验证和安全**:HttpClient支持多种认证机制,如Basic、Digest、NTLM等,并且可以处理HTTPS协议,提供SSL/TLS加密。 5. **重试和恢复策略**:当遇到网络问题时,HttpClient可以自动重试请求,或者根据...

Global site tag (gtag.js) - Google Analytics