`
MauerSu
  • 浏览: 513760 次
  • 性别: Icon_minigender_1
  • 来自: 北京
文章分类
社区版块
存档分类
最新评论

HTTPClient Referer 解决403 forbidden 跳过防盗链

 
阅读更多
源:http://blog.csdn.net/zcwfengbingdongguke/article/details/6519351
评:


package fdl;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.apache.http.util.EntityUtils;

public class HttpClientTest {
    public static void main(String[] args) throws Exception {
        // String url =
        // "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=nihao+&aq=f&aqi=&aql=&oq=&gs_rfai=";
        // String url =
        // "http://119.167.216.6/2/10/69/026c47784babc076affa6a90b04c60bb-f4v-h264-aac-180-32-84120.0-2279819-1298991430078-0f91d62cb920c4876dec925922952da5-1-00-00-00.f4v?vid=26210754&lp=8082&lroot=/1&kfd=1&srchost=119.167.213.25&srcroot=/5&s=1&tm=1299139200&key=5586154467d837f7319a91283408157e&lr=0&nlh=0&check=1&diskid=2&id=ku6_vod&usrip=114.246.175.86&uloc=1.1.2&ipsm=1&ext=.f4v";
        String url = "http://www.bhtv.cc:81/**b/2010/02/08/1.flv";
        init(url);

    }

    public static HttpResponse init(String url) throws Exception {
        // 初始化,此处构造函数就与3.1中不同
        HttpClient httpclient = new DefaultHttpClient();

        // HttpHost targetHost = new HttpHost("3g.youku.com");
        HttpGet httpget = new HttpGet(url);
        // HttpGet httpget = new HttpGet("/");

        // 查看默认request头部信息
        System.out.println("Accept-Charset:"
                + httpget.getFirstHeader("Accept-Charset"));
        // 以下这条如果不加会发现无论你设置Accept-Charset为gbk还是utf-8,他都会默认返回gb2312(本例针对google.cn来说)
        httpget.setHeader("User-Agent",
        // "Mozilla/5.0 (SymbianOS/9.1; U; en-us) AppleWebKit/413 (KHTML, like Gecko) Safari/413");
                RquestHeader.IE.valueOf());
        // "Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.6.30 Version/10.70");
        // 用逗号分隔显示可以同时接受多种编码
        httpget.setHeader("Accept-Language", "zh-cn");
        httpget.setHeader("Accept-Encoding", "gzip, deflate");
        httpget.setHeader("Connection", "Keep-Alive");
        httpget
                .setHeader(
                        "Accept",
                        "image/gif, image/jpeg, image/pjpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/QVOD, application/QVOD, */*");
        // httpget.setHeader("Referer",
        // "http://www.bhtv.cc/");

        // Execute HTTP request
        System.out.println("executing request " + httpget.getURI());
        // HttpResponse response = httpclient.execute(targetHost, httpget);
        HttpResponse response = httpclient.execute(httpget);

        System.out.println("----------------------------------------");
        System.out.println("Location: " + response.getLastHeader("Location"));
        System.out.println(response.getStatusLine().getStatusCode());
        System.out.println(response.getLastHeader("Content-Type"));
        System.out.println(response.getLastHeader("Content-Length"));
        System.out.println("----------------------------------------");

        // 判断页面返回状态判断是否进行转向抓取新链接
        int statusCode = response.getStatusLine().getStatusCode();
        System.out.println("statusCode: " + statusCode);

        httpclient = new DefaultHttpClient();
        response = httpclient.execute(httpget);

        HttpEntity entity = response.getEntity();
        File file = new File("d:/my.flv");
        OutputStream os = new FileOutputStream(file);

        byte[] bytes = EntityUtils.toByteArray(entity);
        os.write(bytes, 0, bytes.length);
        // System.out.println(bytes);

        return response;
    }

    public static String baseHttp(HttpResponse response, String url)
            throws Exception {
        long start = System.currentTimeMillis();
        HttpClient httpclient = new DefaultHttpClient();

        // Get hold of the response entity
        HttpEntity entity = response.getEntity();

        // 查看所有返回头部信息
        Header headers[] = response.getAllHeaders();
        int ii = 0;
        while (ii < headers.length) {
            System.out.println(headers[ii].getName() + ": "
                    + headers[ii].getValue());
            ++ii;
        }
        System.out.println("----------------------------------------");

        // If the response does not enclose an entity, there is no need
        // to bother about connection release
        if (entity != null) {
            // 将源码流保存在一个byte数组当中,因为可能需要两次用到该流,
            byte[] bytes = EntityUtils.toByteArray(entity);
            String charSet = "";

            // 如果头部Content-Type中包含了编码信息,那么我们可以直接在此处获取
            charSet = EntityUtils.getContentCharSet(entity);

            System.out.println("In header: " + charSet);
            // 如果头部中没有,那么我们需要 查看页面源码,这个方法虽然不能说完全正确,因为有些粗糙的网页编码者没有在页面中写头部编码信息
            if (charSet == "" || charSet == null) {
                String regEx_html = "(?=<meta).+?(?<=charset=['/"]?)([//w-]+)(?=['/"//s+])";
                String regEx_xml = "(?=<//?xml).+?(?<=encoding=['/"]?)([//w-]+)(?=['/"//s+])";
                Pattern p_html = Pattern.compile(regEx_html,
                        Pattern.CASE_INSENSITIVE);
                Pattern p_xml = Pattern.compile(regEx_xml,
                        Pattern.CASE_INSENSITIVE);
                Matcher m_html = p_html.matcher(new String(bytes)); // 默认编码转成字符串,因为我们的匹配中无中文,所以串中可能的乱码对我们没有影响
                if (m_html.find())
                    charSet = m_html.group(1);
                else {
                    Matcher m_xml = p_xml.matcher(new String(bytes)); // 默认编码转成字符串,因为我们的匹配中无中文,所以串中可能的乱码对我们没有影响
                    if (m_xml.find())
                        charSet = m_xml.group(1);
                    else
                        charSet = "GBK";
                }
            }
            if (charSet.toUpperCase().startsWith("GB")) // 处理汉字编码集过小情况
                charSet = "GBK";
            System.out.println("Last get: " + charSet);
            // 至此,我们可以将原byte数组按照正常编码专成字符串输出(如果找到了编码的话)

            String txt = new String(bytes, charSet);
            // System.out.println("Encoding string is:/n" + txt);
            long end = System.currentTimeMillis();
            System.out.println("Cost time is: " + (end - start) / 1000.00
                    + " s.");
            return txt;
        }

        httpclient.getConnectionManager().shutdown();
        httpclient.getConnectionManager();

        return "";
    }

    public static String simpleHttpClient(String url) throws Exception {
        long start = System.currentTimeMillis();
        // 初始化,此处构造函数就与3.1中不同
        HttpClient httpclient = new DefaultHttpClient();

        // 这里的http.socket.timeout相当于SO_TIMEOUT
        // httpclient.getParams().setIntParameter("http.socket.timeout", 1);
        HttpParams params = httpclient.getParams();
        HttpConnectionParams.setConnectionTimeout(params, 5000);
        HttpConnectionParams.setSoTimeout(params, 10000);

        // HttpHost targetHost = new HttpHost("3g.youku.com");
        HttpGet httpget = new HttpGet(url);
        // HttpGet httpget = new HttpGet("/");

        // 查看默认request头部信息
        System.out.println("Accept-Charset:"
                + httpget.getFirstHeader("Accept-Charset"));
        // 以下这条如果不加会发现无论你设置Accept-Charset为gbk还是utf-8,他都会默认返回gb2312(本例针对google.cn来说)
        httpget.setHeader("User-Agent",
        // "Mozilla/5.0 (SymbianOS/9.1; U; en-us) AppleWebKit/413 (KHTML, like Gecko) Safari/413");
                RquestHeader.FIREFOX.valueOf());
        // "Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.6.30 Version/10.70");
        // 用逗号分隔显示可以同时接受多种编码
        httpget.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
        httpget.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
        httpget
                .setHeader("Accept",
                        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        httpget.setHeader("Accept-Encoding", "gzip,deflate");
        httpget.setHeader("Keep-Alive", "115");
        httpget.setHeader("Connection", "keep-alive");
        // 验证头部信息设置生效
        System.out.println("Accept-Charset:"
                + httpget.getFirstHeader("Accept-Charset").getValue());

        // Execute HTTP request
        System.out.println("executing request " + httpget.getURI());
        // HttpResponse response = httpclient.execute(targetHost, httpget);
        HttpResponse response = httpclient.execute(httpget);

        response.setEntity(null);

        System.out.println("----------------------------------------");
        System.out.println("Location: " + response.getLastHeader("Location"));
        System.out.println(response.getStatusLine().getStatusCode());
        System.out.println(response.getLastHeader("Content-Type"));
        System.out.println(response.getLastHeader("Content-Length"));
        System.out.println("----------------------------------------");

        // 判断页面返回状态判断是否进行转向抓取新链接
        int statusCode = response.getStatusLine().getStatusCode();
        System.out.println("statusCode: " + statusCode);
        if ((statusCode == HttpStatus.SC_MOVED_PERMANENTLY)
                || (statusCode == HttpStatus.SC_MOVED_TEMPORARILY)
                || (statusCode == HttpStatus.SC_SEE_OTHER)
                || (statusCode == HttpStatus.SC_TEMPORARY_REDIRECT)) {
            // 此处重定向处理 此处还未验证
            String newUri = response.getLastHeader("Location").getValue();
            System.out.println("newUri: ".concat(newUri));
            httpclient = new DefaultHttpClient();
            httpget = new HttpGet(newUri);
            response = httpclient.execute(httpget);
        }

        // Get hold of the response entity
        HttpEntity entity = response.getEntity();

        // 查看所有返回头部信息
        Header headers[] = response.getAllHeaders();
        int ii = 0;
        while (ii < headers.length) {
            System.out.println(headers[ii].getName() + ": "
                    + headers[ii].getValue());
            ++ii;
        }
        System.out.println("----------------------------------------");

        // If the response does not enclose an entity, there is no need
        // to bother about connection release
        if (entity != null) {
            // 将源码流保存在一个byte数组当中,因为可能需要两次用到该流,
            byte[] bytes = EntityUtils.toByteArray(entity);

            {
                //
                File file = new File("c:/a.flv");
                OutputStream os = new FileOutputStream(file);
                os.write(bytes);
                os.flush();
                os.close();
            }

            String charSet = "";

            // 如果头部Content-Type中包含了编码信息,那么我们可以直接在此处获取
            charSet = EntityUtils.getContentCharSet(entity);

            System.out.println("In header: " + charSet);
            // 如果头部中没有,那么我们需要 查看页面源码,这个方法虽然不能说完全正确,因为有些粗糙的网页编码者没有在页面中写头部编码信息
            if (charSet == "" || charSet == null) {
                String regEx_html = "(?=<meta).+?(?<=charset=['/"]?)([//w-]+)(?=['/"//s+])";
                String regEx_xml = "(?=<//?xml).+?(?<=encoding=['/"]?)([//w-]+)(?=['/"//s+])";
                Pattern p_html = Pattern.compile(regEx_html,
                        Pattern.CASE_INSENSITIVE);
                Pattern p_xml = Pattern.compile(regEx_xml,
                        Pattern.CASE_INSENSITIVE);
                Matcher m_html = p_html.matcher(new String(bytes)); // 默认编码转成字符串,因为我们的匹配中无中文,所以串中可能的乱码对我们没有影响
                if (m_html.find())
                    charSet = m_html.group(1);
                else {
                    Matcher m_xml = p_xml.matcher(new String(bytes)); // 默认编码转成字符串,因为我们的匹配中无中文,所以串中可能的乱码对我们没有影响
                    if (m_xml.find())
                        charSet = m_xml.group(1);
                    else
                        charSet = "GBK";
                }
            }
            if (charSet.toUpperCase().startsWith("GB")) // 处理汉字编码集过小情况
                charSet = "GBK";
            System.out.println("Last get: " + charSet);
            // 至此,我们可以将原byte数组按照正常编码专成字符串输出(如果找到了编码的话)

            String txt = new String(bytes, charSet);
            // System.out.println("Encoding string is:/n" + txt);
            long end = System.currentTimeMillis();
            System.out.println("Cost time is: " + (end - start) / 1000.00
                    + " s.");
            return txt;
        }

        httpclient.getConnectionManager().shutdown();

        return "";
    }

    public static void test1(String url) throws Exception {
        // (?<=<a).+?href=["']?(.+?)(?=["' >])

        Pattern p = Pattern.compile("<a//s+?href=[/"']?(.*?)[/"'//s >]",
                Pattern.CASE_INSENSITIVE);

        String txt = simpleHttpClient(url);
        long b = System.nanoTime();

        Matcher m = p.matcher(txt);
        Pattern pt = Pattern.compile("^((javascript|mailto):.*)|([#/])$",
                Pattern.CASE_INSENSITIVE);
        while (m.find()) {
            // System.out.println(m.group(1));
            String rs = m.group(1);

            Matcher mt = pt.matcher(rs);
            if (!mt.matches())
                System.out.println("--/t"
                        .concat(isRelativeAddressToFullUrlAddressNew(rs, url)));

        }
        long e = System.nanoTime() - b;
        System.out.println("Cost time is: " + (e / 1000000000.00) + " s.");

    }

    /**
     *
     * 处理带"#"情况的URL
     *
     * @param link
     *            需要转换的链接
     * @param base
     *            页面的url
     * @return 绝对 URL.
     */
    public static String isRelativeAddressToFullUrlAddressNew(String link,
            String base) {

        String dealLinkStr = link.trim();
        URL url = null;
        try {
            if (dealLinkStr.startsWith("#"))
                url = new URL(base + dealLinkStr);
            else
                url = isRelativeAddressToFullUrlAddress(link, base);
            return url.toString();
        } catch (MalformedURLException e) {
            // e.printStackTrace();
            // logger.debug("连接不合法..... <url> " + link);
            return link;
        }
    }

    /**
     * Build a URL from the link and base provided.
     *
     * @param link
     *            需要转换的链接
     * @param base
     *            页面的url
     * @return 绝对 URL.
     * @exception MalformedURLException
     *                If creating the URL fails.
     */
    public static URL isRelativeAddressToFullUrlAddress(String link, String base)
            throws MalformedURLException {
        String path;
        boolean modified;
        boolean absolute;
        boolean strict = haveQM(base);
        int index;
        URL url; // constructed URL combining relative link and base

        // Bug #1461473 Relative links starting with ?
        if (!strict && ('?' == link.charAt(0))) { // remove query part of base
            // if any
            if (-1 != (index = base.lastIndexOf('?')))
                base = base.substring(0, index);
            url = new URL(base + link);
        } else {
            url = new URL(new URL(base), link);
        }
        path = url.getFile();
        modified = false;
        absolute = link.startsWith("/");
        if (!absolute) { // we prefer to fix incorrect relative links
            // this doesn't fix them all, just the ones at the start
            while (path.startsWith("/.")) {
                if (path.startsWith("/../")) {
                    path = path.substring(3);
                    modified = true;
                } else if (path.startsWith("/./") || path.startsWith("/.")) {
                    path = path.substring(2);
                    modified = true;
                } else
                    break;
            }
        }
        // fix backslashes
        while (-1 != (index = path.indexOf("///"))) {
            path = path.substring(0, index + 1) + path.substring(index + 2);
            modified = true;
        }
        if (modified)
            url = new URL(url, path);

        return (url);
    }

    private static boolean haveQM(String urlStr) {
        if (urlStr.contains("?")) {
            return true;
        } else {
            return false;
        }
    }
}

/**
 * Canned request-header values, one per browser, as returned by
 * {@link #valueOf()}. The code in this file sends them as the
 * {@code User-Agent} header.
 *
 * NOTE(review): the {@code IE8} string advertises "MSIE 7.0" together with
 * "Trident/4.0" - confirm that this is the intended value.
 *
 * @author Jerome
 */
enum RquestHeader {
    Oper("Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.6.30 Version/10.70"),

    IPhone("Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16"),

    IE("Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2; .NET CLR 2.0.50727)"),

    FIREFOX("Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13"),

    IE8("Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/4.0)");

    /** Header text associated with this constant. */
    private final String headerValue;

    RquestHeader(String headerValue) {
        this.headerValue = headerValue;
    }

    /**
     * Returns the header content for this constant.
     *
     * @return the header string
     */
    public String valueOf() {
        return headerValue;
    }
}
在 init 方法中,把下面这两行被注释掉的代码取消注释,让请求带上 Referer 头:

// httpget.setHeader("Referer",
        // "http://www.bhtv.cc/");

这样服务器的防盗链检查就能通过,不再返回 403 Forbidden。
分享到:
评论

相关推荐

    http远程接口调用-httpClient+跳过SSL证书校验

    在IT行业中,网络通信是应用程序之间交互的重要方式。...在apitest文件中,你可以找到包含HttpClient工具类代码,包括专门用于跳过SSL证书校验的工具类,这些代码可以作为参考,帮助你在实际项目中实现类似功能。

    Java爬虫小例子,爬取小网站,突破防盗链下载图片

    - **防盗链处理**:常见的防盗链策略有检查Referer字段,可以伪造Referer或使用代理IP绕过限制。 - **缓存机制**:为了节省网络带宽和提高下载速度,可以使用本地缓存,避免重复下载。 4. **文件操作**: - **...

    java,HttpClient模拟上传,绕过SSL认证

    接下来,我们需要创建一个HttpClient实例,同时配置它跳过SSL认证。这可以通过自定义`SSLContext`和`TrustStrategy`实现: ```java import javax.net.ssl.SSLContext; import javax.net.ssl.TrustManager; import ...

    HttpClient配置SSL绕过https证书实例

    HttpClient配置SSL绕过https证书实例,附件中包含所需httpclient组件jar库。博客地址:http://blog.csdn.net/irokay/article/details/78801307。

    使用httpclient解决跨域问题

    我使用的是httpClient 进行内部转发 我们在A的服务器上,将前台的文件流,通过httpClient传输到B的服务器上(B的服务器通过控制层接受A传输的文件流,让后保存在B的服务器上。返回一个json结果)

    HttpClient4.5 实现https忽略SSL证书验证

    使用HttpClient4.5实现https请求忽略SSL证书验证工具类

    基于httpclient的文件可配置的心跳检测应用

    【标题】"基于httpclient的文件可配置的心跳检测应用"是关于利用Apache HttpClient库进行网络连接健康检查和文件变化监控的技术实现。该应用适用于分布式系统中,确保服务间的通信可靠性,同时也关注本地或远程文件...

    httpclient4.5 绕过ssl认证文件访问

    4. **执行HTTP请求**:现在,你可以使用这个HttpClient实例执行HTTPS请求,它将跳过正常的SSL验证过程。 ```java import org.apache.http.HttpResponse; import org.apache.http.client.methods.HttpGet; import ...

    httpclient 绕开HTTPS证书校验

    但是,我们可以通过创建一个信任所有证书的`TrustManager`实例,然后将其设置到`SSLContext`中,从而跳过证书验证。 以下是一个简单的示例,展示了如何使用`httpclient`实现这一功能: ```java import org.apache....

    彻底解决httpClient乱码问题

    本文将深入探讨如何使用HttpClient来彻底解决乱码问题。 HttpClient是一个功能强大的HTTP客户端,支持多种HTTP协议版本,包括GET、POST等请求方法,以及重试、连接管理等功能。在处理中文字符时,由于编码不一致...

    HttpClient、乱码解决:实例

    本篇将详细介绍如何使用HttpClient解决乱码问题。 一、HttpClient基本使用 HttpClient主要由以下几个核心组件构成: 1. HttpClient:客户端实例,负责管理连接、配置请求等。 2. HttpRequestBase:表示HTTP请求,如...

    httpclient.jar包下载

    《深入解析httpclient.jar及其与code.jar的关联》 在Java开发中,HTTP通信是不可或缺的一部分,而Apache HttpClient库正是Java实现HTTP客户端操作的重要工具。本文将深入探讨httpclient.jar包,以及它与code.jar包...

    httpclient httpclient.jar

    在本文中,我们将深入探讨HttpClient的核心概念、使用方法以及如何通过`httpclient.jar`进行实战应用。 首先,HttpClient的主要组件包括: 1. **HttpClient实例**:这是整个HTTP通信的核心,负责管理连接、请求和...

    HttpClientHelper 工具类

    综上所述,HttpClientHelper 是一个实用的HTTP客户端工具类,结合了HttpClient的强大功能和单例模式的高效管理,为C#开发者提供了便捷的网络请求解决方案,特别适合于爬虫开发和需要频繁进行HTTP通信的项目。

    HttpClient 3.x to HttpComponents HttpClient 4.x

    例如,在HttpClient 3.x中,代码可能会使用`org.apache.commons.httpclient.HttpClient`类和`org.apache.commons.httpclient.methods.GetMethod`等,而在4.x版本中,这些都被新的API所替代。程序员需要熟悉`org.apache....

    httpclient

    创建HttpClient实例是使用HttpClient的第一步。通常,我们会创建一个`CloseableHttpClient`对象,这可以通过`HttpClientBuilder`或者`HttpAsyncClientBuilder`来实现。例如: ```java CloseableHttpClient ...

    httpClient

    HttpClient httpClient = new HttpClient(); // 设置 Http 连接超时为5秒 httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(5000); /* 2 生成 GetMethod 对象并设置参数 */ GetMethod ...

    httpClient 解决 https

    https 的支持单向认证 支持多线程 支持get、post

Global site tag (gtag.js) - Google Analytics