`
mengqingyu
  • 浏览: 328535 次
  • 性别: Icon_minigender_1
  • 来自: 天津
社区版块
存档分类
最新评论

HttpClient抓取解析网站支持多种验证方式

阅读更多
工作中需要抓取多个项目的数据,而这些项目采用了多种验证方式,既有HTTP标准验证(NTLM、BASIC),也有非标准的表单登录验证,因此写了一个较通用的抓取框架,支持多线程。用到的开源框架有HttpClient 4.2.3、Jsoup、JSONObject、Spring 3.0。注意:HttpClient不同版本的API有差异。
设计思路:基于bean+spring配置文件方式,配置多个项目属性,实现项目自动登录,实现通用接口或抽象类,自定义解析类,最后通过url传参,反射实例化对象,实现方法的通用。

<?xml version="1.0" encoding="UTF-8"?>
<!-- Spring bean definitions for the fetch framework: one HttpHandler holding the per-site login settings. -->
<beans xmlns="http://www.springframework.org/schema/beans" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="
		http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd">

	<!-- Login site settings; login strategies for several sites can be configured here -->
	<bean class="com.berheley.bi.grp.fetch.handler.HttpHandler" init-method="init">
		<property name="maxTotal" value="400"/>
		<property name="maxRoute" value="200"/>
		<property name="cnTimeOut" value="60000"/>
		<property name="soTimeOut" value="60000"/>
		<property name="attributes">
			<map>
				<entry key="60.28.43.164"> <!-- Domain name or IP address, plus port number; HttpHandler.getHttpAttributes looks this key up using the text between "://" and the next "/" of the request URL -->
					<bean class="com.berheley.bi.grp.fetch.pojo.HttpAttributes">
						<property name="packPath" value="com.berheley.bi.grp.fetch.custom.business"/>
						<property name="domain" value="60.28.43.164"/>
						<property name="port" value="80"/>   
						<property name="loginUrl" value=""/>	<!-- Full URL the login form is submitted to, e.g. http://www.iteye.com/login.jsp -->
						<property name="errorUrl" value=""/>	<!-- Address requested after a failed login, e.g. /error.jsp -->
						<property name="scheme" value="NTLM"/> <!-- HttpHandler.init() handles NTLM, BASIC and POST (case-insensitive) -->
						<property name="params">
							<map>
								<entry key="username" value="登录名"/>
								<entry key="password" value="密码"/>
							</map>
						</property>
					</bean>
				</entry>
			</map>
		</property>
	</bean>
</beans>


import java.util.Map;

import org.apache.http.client.HttpClient;

/**
 * Settings describing one remote site that the framework logs in to and
 * fetches pages from. Instances are plain mutable beans populated from the
 * Spring configuration.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 */
public class HttpAttributes {

	/** Authentication scheme used by the site. */
	private String scheme;

	/** Login parameters (for example user name and password). */
	private Map<String, String> params;

	/** Port number; defaults to 80. */
	private int port = 80;

	/** Address requested after a failed login, e.g. /error.jsp. */
	private String errorUrl = "";

	/** Full URL the login form is submitted to, e.g. http://www.iteye.com/login.jsp. */
	private String loginUrl = "";

	/** Domain name or IP address of the site. */
	private String domain = "";

	/** Package that contains the parser classes for pages of this site. */
	private String packPath;

	/** Client used to send requests to this site. */
	private HttpClient httpClient;

	/** @return the client used to send requests to this site */
	public HttpClient getHttpClient() {
		return this.httpClient;
	}

	/** @param httpClient the client used to send requests to this site */
	public void setHttpClient(HttpClient httpClient) {
		this.httpClient = httpClient;
	}

	/** @return the package holding this site's parser classes */
	public String getPackPath() {
		return this.packPath;
	}

	/** @param packPath the package holding this site's parser classes */
	public void setPackPath(String packPath) {
		this.packPath = packPath;
	}

	/** @return the domain name or IP address */
	public String getDomain() {
		return this.domain;
	}

	/** @param domain the domain name or IP address */
	public void setDomain(String domain) {
		this.domain = domain;
	}

	/** @return the full URL of the login form submission */
	public String getLoginUrl() {
		return this.loginUrl;
	}

	/** @param loginUrl the full URL of the login form submission */
	public void setLoginUrl(String loginUrl) {
		this.loginUrl = loginUrl;
	}

	/** @return the address requested after a failed login */
	public String getErrorUrl() {
		return this.errorUrl;
	}

	/** @param errorUrl the address requested after a failed login */
	public void setErrorUrl(String errorUrl) {
		this.errorUrl = errorUrl;
	}

	/** @return the port number */
	public int getPort() {
		return this.port;
	}

	/** @param port the port number */
	public void setPort(int port) {
		this.port = port;
	}

	/** @return the login parameters */
	public Map<String, String> getParams() {
		return this.params;
	}

	/** @param params the login parameters */
	public void setParams(Map<String, String> params) {
		this.params = params;
	}

	/** @return the authentication scheme */
	public String getScheme() {
		return this.scheme;
	}

	/** @param scheme the authentication scheme */
	public void setScheme(String scheme) {
		this.scheme = scheme;
	}
}

import java.util.Map;

/**
 * Common contract implemented by every parser of fetched content.
 *
 * @param <T> type of the parse result
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 */
public interface IParse<T> {

	/**
	 * Parses the fetched content.
	 *
	 * @param params request parameters, including the custom ones whose
	 *               names start with {@code m_} in the URL
	 * @return the parse result
	 */
	T process(Map<String, Object> params);
}

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Base class for parsers that work on an HTML page. The page text is parsed
 * with jsoup once, at construction time, and exposed to subclasses through
 * {@link #doc}. Helper methods shared by several HTML parsers can be added
 * here.
 *
 * @param <T> type of the parse result
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 */
public abstract class HtmlParse<T> implements IParse<T>{

	/** Parsed HTML document available to subclasses. */
	protected Document doc;

	/** Logger available to subclasses. */
	protected Log log = LogFactory.getLog(HtmlParse.class);

	/**
	 * @param doc HTML text of the page to parse
	 */
	public HtmlParse(String doc) {
		Document parsed = Jsoup.parse(doc);
		this.doc = parsed;
	}
}

import net.sf.json.JSONObject;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Base class for parsers that work on a JSON response. The response text is
 * converted to a {@code JSONObject} once, at construction time, and exposed
 * to subclasses through {@link #doc}. Helper methods shared by several JSON
 * parsers can be added here.
 *
 * @param <T> type of the parse result
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 */
public abstract class JsonParse<T> implements IParse<T>{

	/** Parsed JSON object available to subclasses. */
	protected JSONObject doc;

	/** Logger available to subclasses. */
	protected Log log = LogFactory.getLog(JsonParse.class);

	/**
	 * @param doc JSON text to parse
	 */
	public JsonParse(String doc) {
		JSONObject parsed = JSONObject.fromObject(doc);
		this.doc = parsed;
	}
}

package com.berheley.bi.grp.fetch.parse;

import java.util.Map;

/**
 * Common contract implemented by every parser of fetched content.
 *
 * @param <T> type of the parse result
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 */
public interface IParse<T> {

	/**
	 * Parses the fetched content.
	 *
	 * @param params request parameters, including the custom ones whose
	 *               names start with {@code m_} in the URL
	 * @return the parse result
	 */
	T process(Map<String, Object> params);
}

import java.util.Map;

import net.sf.json.JSONObject;

import com.berheley.bi.grp.fetch.parse.HtmlParse;


/**
 * Extracts a fixed set of form values from a detail page and returns them
 * as a JSON string.
 */
public class FyxxInfoHtmlParse extends HtmlParse<String>{

	/**
	 * Pairs of {JSON key, id of the HTML element holding the value}, in the
	 * order the keys are written to the result. The page has no price field.
	 */
	private static final String[][] FIELDS = {
		{"rentcost", "tfj_rentcost"},             // rent
		{"buildingarea", "tfj_buildingarea"},     // area
		{"standardstorey", "tfj_standardstorey"}, // standard storey height
		{"floorloading", "tfj_floorloading_d"},   // floor loading
		{"phone", "tfj_phone"},                   // owner contact
		{"propertycost", "tfj_propertycost"},     // property fee
		{"watercost", "tfj_watercost"},           // water
		{"eleccost", "tfj_eleccost"}              // electricity
	};

	/**
	 * @param doc HTML text of the page to parse
	 */
	public FyxxInfoHtmlParse(String doc) {
		super(doc);
	}

	/**
	 * Reads every configured field from the page and serialises the values.
	 *
	 * @param params request parameters (not used by this parser)
	 * @return JSON text with one entry per field plus {@code "success": true};
	 *         a field whose element is missing from the page is reported as
	 *         an empty string
	 */
	@Override
	public String process(Map<String, Object> params) {
		JSONObject jsonObj = new JSONObject();
		for (int i = 0; i < FIELDS.length; i++) {
			jsonObj.put(FIELDS[i][0], fieldValue(FIELDS[i][1]));
		}
		jsonObj.put("success", true);
		return jsonObj.toString();
	}

	/**
	 * Returns the value of the element with the given id, or an empty string
	 * when the page has no such element (getElementById returns null then).
	 */
	private String fieldValue(String elementId) {
		return doc.getElementById(elementId) == null ? "" : doc.getElementById(elementId).val();
	}

}

/**
 * Constants shared by the fetch framework.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 */
public final class HttpConstant {

	/** Scheme name for sites that log in through a form POST. */
	public static final String POST = "POST";

	/** Name of the request parameter carrying the URL to fetch. */
	public static final String URL = "m_url";

	/** Name of the request parameter carrying the parser class name. */
	public static final String PARSE = "m_parse";

	/** GBK charset name. */
	public static final String GBK = "gbk";

	/** UTF-8 charset name. */
	public static final String UTF8 = "UTF-8";
}

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Constructor;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.entity.ContentType;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;

import com.berheley.bi.basic.exp.BusinessException;
import com.berheley.bi.grp.fetch.parse.IParse;
import com.berheley.bi.grp.fetch.pojo.HttpAttributes;

/**
 * Static helpers for sending requests, reading responses and preparing
 * request parameters.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 */
public final class HttpUtils {
	
	private static final Log log = LogFactory.getLog(HttpUtils.class);
	
	/**
	 * Sends a GET request.
	 *
	 * @param httpclient client to send the request with
	 * @param url        address to request
	 * @return the response, never {@code null}
	 * @throws IllegalStateException if the request fails with an I/O or protocol error
	 */
	public static HttpResponse httpGet(HttpClient httpclient, String url) {
		HttpResponse response = execute(httpclient, new HttpGet(url), null);
		log.info("get status: " + response.getStatusLine());
		return response;
	}
	
	/**
	 * Sends a GET request with an execution context.
	 *
	 * @param httpclient client to send the request with
	 * @param url        address to request
	 * @param context    execution context (e.g. {@code new BasicHttpContext()});
	 *                   after the call it gives access to the request that was
	 *                   finally executed
	 * @return the response, never {@code null}
	 * @throws IllegalStateException if the request fails with an I/O or protocol error
	 */
	public static HttpResponse httpGet(HttpClient httpclient, String url, HttpContext context) {
		HttpResponse response = execute(httpclient, new HttpGet(url), context);
		log.info("get status: " + response.getStatusLine());
		return response;
	}
	
	/**
	 * Sends a GET request and, for sites using the POST login scheme, logs in
	 * and repeats the request when the first attempt ended on the site's
	 * error address.
	 *
	 * @param httpclient client to send the request with
	 * @param url        address to request
	 * @param context    execution context used to read the request that was
	 *                   finally executed
	 * @param attributes settings of the target site
	 * @return the response of the last GET request
	 * @throws IllegalStateException if a request fails with an I/O or protocol error
	 */
	public static HttpResponse httpGetByScheme(HttpClient httpclient, String url, HttpContext context, HttpAttributes attributes) {
		HttpResponse response = httpGet(httpclient, url, context);
		HttpUriRequest req = (HttpUriRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST);
		log.info("get请求跳转地址: " + (req == null ? null : req.getURI()));
		boolean loginRequired = req != null
				&& HttpConstant.POST.equalsIgnoreCase(attributes.getScheme())
				&& attributes.getErrorUrl().equalsIgnoreCase(req.getURI().toString());
		if (loginRequired) {
			// The error page and the login response are not needed; consume
			// them so their pooled connections are handed back before retrying.
			release(response);
			release(httpPost(httpclient, attributes.getLoginUrl(), getPairs(attributes.getParams())));
			response = httpGet(httpclient, url, context);
		}
		log.info("get status: " + response.getStatusLine());
		return response;
	}
	
	/**
	 * Submits a form with a POST request; the form is encoded as GBK.
	 *
	 * @param httpclient client to send the request with
	 * @param url        address to post to
	 * @param params     form fields
	 * @return the response, never {@code null}
	 * @throws IllegalStateException if the request fails with an I/O or protocol error
	 */
	public static HttpResponse httpPost(HttpClient httpclient, String url, List<NameValuePair> params) {
		HttpPost httpost = new HttpPost(url);
		httpost.setEntity(new UrlEncodedFormEntity(params, Charset.forName(HttpConstant.GBK)));
		HttpResponse response = execute(httpclient, httpost, null);
		log.info("post status: " + response.getStatusLine());
		return response;
	}
	
	/**
	 * Returns the target host recorded in the execution context.
	 *
	 * @param context execution context of a finished request
	 * @return the target host attribute of the context
	 */
	public static HttpHost getHttpHost(HttpContext context) {
		return (HttpHost) context.getAttribute(ExecutionContext.HTTP_TARGET_HOST);
	}
	
	/**
	 * Returns the request recorded in the execution context.
	 *
	 * @param context execution context of a finished request
	 * @return the request attribute of the context
	 */
	public static HttpUriRequest getHttpUriRequest(HttpContext context) {
		return (HttpUriRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST);
	}
	
	/**
	 * Converts a map into form fields.
	 *
	 * @param params map of field names to values; may be {@code null}
	 * @return one pair per entry; entries with a {@code null} key are skipped
	 *         and a {@code null} value becomes an empty string
	 */
	public static List<NameValuePair> getPairs(Map<?, ?> params) {
		List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>();
		if (params != null) {
			for (Map.Entry<?, ?> entry : params.entrySet()) {
				if (entry.getKey() == null) {
					continue;
				}
				String value = entry.getValue() == null ? "" : entry.getValue().toString();
				nameValuePairs.add(new BasicNameValuePair(entry.getKey().toString(), value));
			}
		}
		return nameValuePairs;
	}
	
	/**
	 * Reads the response body as text and releases the connection.
	 * The charset comes from the response's content type, falling back to GBK.
	 * Line terminators are not kept. Reading is best effort: when it fails
	 * part-way the error is logged and the text read so far is returned.
	 *
	 * @param response response whose body is read
	 * @return the body text, or an empty string when the response has no body
	 */
	public static String entityToString(HttpResponse response) {
		StringBuilder sb = new StringBuilder();
		HttpEntity entity = response.getEntity();
		if (entity == null) {
			return sb.toString();
		}
		Charset charset = ContentType.getOrDefault(entity).getCharset();
		if (charset == null) {
			charset = Charset.forName(HttpConstant.GBK);
		}
		InputStream is = null;
		BufferedReader br = null;
		try {
			is = entity.getContent();
			if (is != null) {
				br = new BufferedReader(new InputStreamReader(is, charset));
				String line = null;
				while ((line = br.readLine()) != null) {
					sb.append(line);
				}
			}
		} catch (Exception e) {
			log.error("failed to read response body", e);
		} finally {
			try {
				if (br != null) {
					br.close();
				} else if (is != null) {
					is.close();
				}
				EntityUtils.consume(entity);
			} catch (IOException e) {
				log.warn("failed to close response body", e);
			}
		}
		return sb.toString();
	}
	
	/**
	 * Creates the parser named {@code packPath + "." + parseBean} by calling
	 * its constructor that takes the fetched text.
	 *
	 * NOTE(review): parseBean comes from the request URL, so the caller decides
	 * which class in (or below) packPath is loaded; consider restricting it to
	 * a known set of parser names.
	 *
	 * @param packPath  package containing the parser class
	 * @param parseBean simple name of the parser class
	 * @param text      fetched content handed to the parser's constructor
	 * @return the new parser
	 * @throws BusinessException if the class cannot be found or instantiated
	 */
	@SuppressWarnings("unchecked")
	public static IParse<String> newInstance(String packPath, String parseBean, String text) throws BusinessException{
		try {
			Class<?> clazz = Class.forName(packPath+"."+parseBean);
			Constructor<?> constructor = clazz.getConstructor(String.class);
			return (IParse<String>) constructor.newInstance(text);
		} catch (Exception e) {
			throw new BusinessException("网页解析类初始化错误 "+e.getMessage(), e);
		}
	}
	
	/**
	 * Prepares the URL to fetch. Query parameters whose name starts with
	 * {@code m_} are moved out of the URL into {@code params}; the remaining
	 * query string is URL-encoded while keeping its {@code =} and {@code &}
	 * separators.
	 *
	 * @param params request parameters; must contain the URL under
	 *               {@link HttpConstant#URL}; receives the extracted
	 *               {@code m_} parameters
	 * @return the URL without its {@code m_} parameters
	 * @throws IllegalArgumentException if {@code params} holds no URL
	 */
	public static String initParams(Map<String, Object> params) {
		Object rawUrl = params.get(HttpConstant.URL);
		if (rawUrl == null) {
			throw new IllegalArgumentException("missing required parameter " + HttpConstant.URL);
		}
		String url = rawUrl.toString();
		int index = url.indexOf("?");
		if(index==-1) return url;
		String urlPath = url.substring(0, index+1);
		String[] urlArray = url.substring(index+1).split("&");
		StringBuilder remaining = new StringBuilder();
		for (int i = 0; i < urlArray.length; i++) {
			String pair = urlArray[i];
			if(pair.startsWith("m_")) {
				// Split on the first '=' only so values containing '=' stay whole.
				int eq = pair.indexOf('=');
				if (eq == -1) {
					params.put(pair, "");
				} else {
					params.put(pair.substring(0, eq), pair.substring(eq+1));
				}
				continue;
			}
			if (remaining.length() > 0) {
				remaining.append('&');
			}
			remaining.append(pair);
		}
		// Encode the whole query, then restore the separators the encoder escaped.
		String paramStr = urlEncoder(remaining.toString());
		paramStr = paramStr.replace("%3D", "=").replace("%26", "&");
		return urlPath+paramStr;
	}
	
	/**
	 * URL-encodes a string as UTF-8.
	 *
	 * @param paramStr text to encode
	 * @return the encoded text, or the input unchanged if the encoding is not supported
	 */
	public static String urlEncoder(String paramStr) {
		try {
			paramStr = URLEncoder.encode(paramStr,HttpConstant.UTF8);
		} catch (UnsupportedEncodingException e) {
			log.error("url编码错误", e);
		}
		return paramStr;
	}
	
	/**
	 * Executes a request, with the context when one is given. A failed
	 * request is aborted and reported as an unchecked exception that keeps
	 * the original cause, instead of leaving a null response behind.
	 */
	private static HttpResponse execute(HttpClient httpclient, HttpUriRequest request, HttpContext context) {
		try {
			return context == null ? httpclient.execute(request) : httpclient.execute(request, context);
		} catch (IOException e) {
			request.abort();
			throw new IllegalStateException(request.getMethod() + " request failed: " + request.getURI(), e);
		}
	}
	
	/**
	 * Consumes the body of a response that is not going to be read so that
	 * its connection can be reused.
	 */
	private static void release(HttpResponse response) {
		if (response == null) {
			return;
		}
		try {
			EntityUtils.consume(response.getEntity());
		} catch (IOException e) {
			log.warn("failed to release response", e);
		}
	}
}

import java.io.IOException;

import jcifs.ntlmssp.NtlmFlags;
import jcifs.ntlmssp.Type1Message;
import jcifs.ntlmssp.Type2Message;
import jcifs.ntlmssp.Type3Message;
import jcifs.util.Base64;

import org.apache.http.impl.auth.NTLMEngine;
import org.apache.http.impl.auth.NTLMEngineException;

/**
 * {@code NTLMEngine} implementation that builds the NTLM messages with the
 * JCIFS library (Windows domain authentication).
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 */
public final class JCIFSEngine implements NTLMEngine {

	/** Flags sent in the type 1 (negotiate) message. */
	private static final int TYPE_1_FLAGS =
			NtlmFlags.NTLMSSP_NEGOTIATE_56
			| NtlmFlags.NTLMSSP_NEGOTIATE_128
			| NtlmFlags.NTLMSSP_NEGOTIATE_NTLM2
			| NtlmFlags.NTLMSSP_NEGOTIATE_ALWAYS_SIGN
			| NtlmFlags.NTLMSSP_REQUEST_TARGET;

	/** Flags of the type 2 message that are cleared before building the type 3 message. */
	private static final int TARGET_TYPE_FLAGS =
			NtlmFlags.NTLMSSP_TARGET_TYPE_DOMAIN | NtlmFlags.NTLMSSP_TARGET_TYPE_SERVER;

	/**
	 * Builds the Base64-encoded type 1 message.
	 *
	 * @param domain      domain to authenticate in
	 * @param workstation name of the client workstation
	 * @return the encoded type 1 message
	 */
	public String generateType1Msg(final String domain, final String workstation) throws NTLMEngineException {
		final byte[] raw = new Type1Message(TYPE_1_FLAGS, domain, workstation).toByteArray();
		return Base64.encode(raw);
	}

	/**
	 * Builds the Base64-encoded type 3 message answering a server challenge.
	 *
	 * @param username    user name
	 * @param password    password
	 * @param domain      domain to authenticate in
	 * @param workstation name of the client workstation
	 * @param challenge   Base64-encoded type 2 message received from the server
	 * @return the encoded type 3 message
	 * @throws NTLMEngineException if the challenge cannot be decoded as a type 2 message
	 */
	public String generateType3Msg(final String username, final String password, final String domain, final String workstation, final String challenge)
			throws NTLMEngineException {
		final Type2Message serverChallenge;
		try {
			serverChallenge = new Type2Message(Base64.decode(challenge));
		} catch (final IOException exception) {
			throw new NTLMEngineException("Invalid NTLM type 2 message", exception);
		}
		// Keep every flag the server sent except the two target-type flags.
		final int type3Flags = serverChallenge.getFlags() & ~TARGET_TYPE_FLAGS;
		final Type3Message reply = new Type3Message(serverChallenge, password, domain, username, workstation, type3Flags);
		return Base64.encode(reply.toByteArray());
	}

}

import org.apache.http.auth.AuthScheme;
import org.apache.http.auth.AuthSchemeFactory;
import org.apache.http.impl.auth.NTLMScheme;
import org.apache.http.params.HttpParams;

/**
 * Factory producing NTLM authentication schemes backed by
 * {@link JCIFSEngine} (Windows domain authentication).
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 */
public class NTLMSchemeFactory implements AuthSchemeFactory {

    /**
     * Creates a new NTLM scheme with its own JCIFS engine.
     *
     * @param params HTTP parameters (not used)
     * @return a new NTLM scheme
     */
    public AuthScheme newInstance(final HttpParams params) {
        final JCIFSEngine engine = new JCIFSEngine();
        return new NTLMScheme(engine);
    }

}

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.NTCredentials;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.auth.params.AuthPNames;
import org.apache.http.client.HttpClient;
import org.apache.http.client.params.AuthPolicy;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.HttpParams;

import com.berheley.bi.grp.fetch.ntlm.NTLMSchemeFactory;
import com.berheley.bi.grp.fetch.pojo.HttpAttributes;
import com.berheley.bi.grp.fetch.util.HttpConstant;

/**
 * Builds and holds the HTTP clients used to log in to the configured sites.
 * The constructor creates a pooled connection manager and a shared default
 * client; {@link #init()} then prepares one client per configured site
 * according to the site's authentication scheme.
 * 
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 */
public class HttpHandler {

	private Log log = LogFactory.getLog(HttpHandler.class);
	
	// Upper limit on the total number of pooled connections
	private int maxTotal = 400;

	// Number of concurrent connections allowed per target server (route)
	private int maxRoute = 200;

	// Connection timeout in milliseconds
	private int cnTimeOut = 60000;

	// Data transfer (socket) timeout in milliseconds
	private int soTimeOut = 60000;

	// Shared client, used for sites with the POST scheme
	private HttpClient httpClient;

	// Connection settings shared by all clients
	private HttpParams httpParams;

	// Connection manager shared by all clients (multi-threaded use)
	private ClientConnectionManager connectionManager;

	// key: host (IP or domain, optionally with port), value: settings of that site
	private Map<String, HttpAttributes> attributes;

	/**
	 * Creates the connection settings, the pooled connection manager and the
	 * shared client from the default limits and timeouts. Values configured
	 * afterwards through the setters are applied to these objects by the
	 * setters themselves.
	 */
	public HttpHandler() {
		httpParams = this.getHp();
		connectionManager = this.getCm();
		httpClient = new DefaultHttpClient(connectionManager, httpParams);
	}

	public int getMaxTotal() {
		return maxTotal;
	}

	/**
	 * Sets the total connection limit and applies it to the current
	 * connection manager when that is a pooling manager. Without this the
	 * value would be ignored, because the pool is created in the constructor
	 * before any property is injected.
	 */
	public void setMaxTotal(int maxTotal) {
		this.maxTotal = maxTotal;
		if (connectionManager instanceof PoolingClientConnectionManager) {
			((PoolingClientConnectionManager) connectionManager).setMaxTotal(maxTotal);
		}
	}

	public int getMaxRoute() {
		return maxRoute;
	}

	/**
	 * Sets the per-route connection limit and applies it to the current
	 * connection manager when that is a pooling manager.
	 */
	public void setMaxRoute(int maxRoute) {
		this.maxRoute = maxRoute;
		if (connectionManager instanceof PoolingClientConnectionManager) {
			((PoolingClientConnectionManager) connectionManager).setDefaultMaxPerRoute(maxRoute);
		}
	}

	public int getCnTimeOut() {
		return cnTimeOut;
	}

	/**
	 * Sets the connection timeout and applies it to the current connection settings.
	 */
	public void setCnTimeOut(int cnTimeOut) {
		this.cnTimeOut = cnTimeOut;
		if (httpParams != null) {
			httpParams.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, cnTimeOut);
		}
	}

	public int getSoTimeOut() {
		return soTimeOut;
	}

	/**
	 * Sets the socket timeout and applies it to the current connection settings.
	 */
	public void setSoTimeOut(int soTimeOut) {
		this.soTimeOut = soTimeOut;
		if (httpParams != null) {
			httpParams.setParameter(CoreConnectionPNames.SO_TIMEOUT, soTimeOut);
		}
	}

	public HttpParams getHttpParams() {
		return httpParams;
	}

	public void setHttpParams(HttpParams httpParams) {
		this.httpParams = httpParams;
	}

	public ClientConnectionManager getConnectionManager() {
		return connectionManager;
	}

	public void setConnectionManager(ClientConnectionManager connectionManager) {
		this.connectionManager = connectionManager;
	}

	public Map<String, HttpAttributes> getAttributes() {
		return attributes;
	}

	public void setAttributes(Map<String, HttpAttributes> attributes) {
		this.attributes = attributes;
	}

	/**
	 * Prepares an HTTP client for every configured site:
	 * NTLM and BASIC sites each get their own client carrying the site's
	 * credentials, POST sites share the default client. Sites with any other
	 * scheme get no client and are reported in the log.
	 */
	public void init() {
		if (attributes == null) {
			log.warn("no site attributes configured, nothing to initialise");
			return;
		}
		for (Entry<String, HttpAttributes> entry : attributes.entrySet()) {
			HttpAttributes site = entry.getValue();
			String scheme = site.getScheme();
			if (AuthPolicy.NTLM.equalsIgnoreCase(scheme)) {
				DefaultHttpClient client = new DefaultHttpClient(connectionManager, httpParams);
				List<String> authpref = new ArrayList<String>();
				authpref.add(AuthPolicy.NTLM);
				client.getParams().setParameter(AuthPNames.TARGET_AUTH_PREF, authpref);
				client.getAuthSchemes().register(AuthPolicy.NTLM, new NTLMSchemeFactory());
				NTCredentials creds = new NTCredentials(site.getParams().get("username"), site.getParams().get("password"), "", "");
				client.getCredentialsProvider().setCredentials(AuthScope.ANY, creds);
				site.setHttpClient(client);
			} else if (AuthPolicy.BASIC.equalsIgnoreCase(scheme)) {
				DefaultHttpClient client = new DefaultHttpClient(connectionManager, httpParams);
				client.getCredentialsProvider().setCredentials(new AuthScope(site.getDomain(), site.getPort()),
						new UsernamePasswordCredentials(site.getParams().get("username"), site.getParams().get("password")));
				site.setHttpClient(client);
			} else if (HttpConstant.POST.equalsIgnoreCase(scheme)) {
				site.setHttpClient(this.httpClient);
			} else {
				log.warn("unsupported scheme '" + scheme + "' for " + entry.getKey() + ", no HttpClient assigned");
			}
		}
		log.info("初始化 HttpClient");
	}

	/**
	 * Creates the connection settings from the current timeouts.
	 *
	 * @return new parameters holding the connection and socket timeouts
	 */
	private HttpParams getHp() {
		HttpParams params = new BasicHttpParams();
		params.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, cnTimeOut);
		params.setParameter(CoreConnectionPNames.SO_TIMEOUT, soTimeOut);
		return params;
	}

	/**
	 * Creates the pooled connection manager for multi-threaded use.
	 * http is registered on port 80 with plain sockets, https on port 443
	 * with SSL sockets.
	 *
	 * @return a new pooling connection manager using the current limits
	 */
	private ClientConnectionManager getCm() {
		SchemeRegistry schemeRegistry = new SchemeRegistry();
		schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
		schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
		PoolingClientConnectionManager cm = new PoolingClientConnectionManager(schemeRegistry);
		cm.setMaxTotal(maxTotal);
		cm.setDefaultMaxPerRoute(maxRoute);
		return cm;
	}

	/**
	 * Looks up the site settings for a URL. The lookup key is the part of the
	 * URL after {@code "://"} up to the first {@code '/'}, {@code '?'} or
	 * {@code '#'} (or the end of the URL), i.e. the host with its optional port.
	 *
	 * @param url address that is going to be requested
	 * @return the settings configured for the URL's host, or {@code null}
	 *         when the URL is {@code null} or no site is configured for it
	 */
	public HttpAttributes getHttpAttributes(String url) {
		if (url == null || attributes == null) {
			return null;
		}
		int schemeEnd = url.indexOf("://");
		String rest = schemeEnd == -1 ? url : url.substring(schemeEnd + 3);
		int end = rest.length();
		for (int i = 0; i < rest.length(); i++) {
			char c = rest.charAt(i);
			if (c == '/' || c == '?' || c == '#') {
				end = i;
				break;
			}
		}
		return attributes.get(rest.substring(0, end));
	}
}

import java.util.Map;

import com.berheley.bi.basic.exp.BusinessException;

/**
 * Service that fetches data from a remote site and parses it.
 *
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 */
public interface IFetchService {
	
	/**
	 * Fetches a page and parses it.
	 *
	 * @param params request parameters; the key {@code m_url} is required and
	 *               holds the full URL of the request including its query
	 *               parameters, among which {@code m_parse} must be present
	 * @return the parse result
	 * @throws BusinessException if fetching or parsing fails
	 */
	String findDate(Map<String,Object> params) throws BusinessException;
}

import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpResponse;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import com.berheley.bi.basic.exp.BusinessException;
import com.berheley.bi.grp.fetch.handler.HttpHandler;
import com.berheley.bi.grp.fetch.parse.IParse;
import com.berheley.bi.grp.fetch.pojo.HttpAttributes;
import com.berheley.bi.grp.fetch.util.HttpConstant;
import com.berheley.bi.grp.fetch.util.HttpUtils;

/**
 * Default implementation of the fetch service: requests the page with the
 * client configured for the target site and hands the response text to the
 * parser named in the request.
 * 
 * @author <a href="mailto:qingyu.meng21@gmail.com">mengqingyu </a>
 */
@Service
public class FetchService implements IFetchService {

	private Log log = LogFactory.getLog(FetchService.class);

	@Autowired
	private HttpHandler httpHandler;

	/**
	 * Fetches the URL given in {@code params} and parses the response.
	 *
	 * @param params request parameters; must contain the URL, whose query
	 *               string must name the parser class in {@code m_parse}
	 * @return the JSON text produced by the parser
	 * @throws BusinessException if no usable site configuration exists for
	 *                           the URL, the parser name is missing, or the
	 *                           parser cannot be created
	 */
	@Override
	public String findDate(Map<String, Object> params) throws BusinessException {
		String url = HttpUtils.initParams(params);
		HttpAttributes attributes = httpHandler.getHttpAttributes(url);
		if (attributes == null || attributes.getHttpClient() == null) {
			throw failure("no HttpClient configured for url: " + url);
		}
		// Check the parser name before sending the request so a bad call costs no round trip.
		Object parseBean = params.get(HttpConstant.PARSE);
		if (parseBean == null) {
			throw failure("missing required parameter " + HttpConstant.PARSE + " in url: " + url);
		}
		HttpContext localContext = new BasicHttpContext();
		HttpResponse response = HttpUtils.httpGetByScheme(attributes.getHttpClient(), url, localContext, attributes);
		String result = HttpUtils.entityToString(response);
		IParse<String> parse = HttpUtils.newInstance(attributes.getPackPath(), parseBean.toString(), result);
		String json = parse.process(params);
		log.info(json);
		return json;
	}

	/**
	 * Builds the exception reported for an invalid request.
	 */
	private static BusinessException failure(String message) {
		return new BusinessException(message, new IllegalArgumentException(message));
	}

}
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics