Things have been quiet at work lately, and I came across the Apache Nutch project. A while ago I scraped some data from the web with code someone else gave me; I only tweaked it enough to make it work and never dug into how it actually operated, so I have now gone back and studied it properly.
From the examples I found online, a web crawler basically comes down to three steps: first, download the site you want to crawl; second, extract the links from it; third, extract the content that matches your rules. The original implementation is below:
package com.shangkang.pzf.xywy;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import com.shangkang.yjw.manager.LinkQueue;
import com.shangkang.yjw.manager.Queue;
import com.shangkang.yjw.util.Constant;
public class GetStartPoint {
public static void main(String[] args) {
String baseUrl = "http://club.xywy.com/";
new GetStartPoint().downloadFile(baseUrl,"xywy");
String filePath = "d:/crawler-cust/xywy.html";
testParserHtml2NeedLink(filePath);
}
// Step 1: download the site to be crawled
public void downloadFile(String url,String fileName){
String saveFilePath = "d:/crawler-cust/";
HttpClient hc = null;
try {
hc = new DefaultHttpClient();
hc.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, 5000);
HttpGet httpGet = new HttpGet(url);
HttpResponse response = hc.execute(httpGet);
response.getParams();
HttpEntity entity = response.getEntity();
if(entity != null)
{
System.out.println(entity.getContentType());
InputStream is = entity.getContent();
FileUtils.copyInputStreamToFile(is, new File(saveFilePath + fileName + ".html"));
}
} catch (ClientProtocolException e) {
e.printStackTrace();
} catch (IllegalStateException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}finally{
hc.getConnectionManager().shutdown();
}
}
// Step 2: parse the downloaded page and extract the links we need
public static void testParserHtml2NeedLink(String filePath)
{
try {
Parser parser = new Parser(filePath);
NodeList nodeList = parser.extractAllNodesThatMatch(new NodeFilter() {
@Override
public boolean accept(Node node) {
if(node.getText().startsWith("dl class=\"clearfix\""))
{ System.out.println("node.getText() = " + node.getText()); // matches <dl class="clearfix">
return true;
}else
{
return false;
}
}
});
NodeList nodeListA = new NodeList();
NodeList nodeListDd = new NodeList();
if(nodeList != null)
{
int size = nodeList.size();
for(int i = 0 ; i < size ; i ++)
{
Node dlNode = nodeList.elementAt(i);
nodeListDd = dlNode.getChildren();
nodeListA.add(nodeListDd.extractAllNodesThatMatch(new NodeFilter() {
@Override
public boolean accept(Node node) {
if(node.getText().startsWith("a target=\"_blank\" href="))
{ System.out.println(node.getText());
return true;
}
return false;
}
},true));
}
}
System.out.println("-------------------------------");
int size = nodeListA.size();
for(int i = 0; i< size ; i++)
{
// nodeListA.
Node node = nodeListA.elementAt(i);
if(node instanceof LinkTag)
{
String link = ((LinkTag)node).extractLink();
// System.out.println("link == " + link.replace("file://localhost", base_url_yp900));
link = link.replace("file://localhost", "");
System.out.println(link);
link = Constant.BASE_URL_XYWY + link;
LinkQueue.addUnvisitedUrl(link);
LinkQueue.addUnvisitedUrlName(new String(node.toPlainTextString().getBytes("ISO-8859-1"),"GBK"));
}
// System.out.println(node);
}
File file = new File(Constant.SAVE_FILE_DIR + "xywy_need_links.txt");
File fileName = new File(Constant.SAVE_FILE_DIR + "xywy_need_links_TypeName.txt");
// Queue<String> ulrNames = LinkQueue.getUnVisitedUrlQueue();
Queue<String> urls = LinkQueue.getUnVisitedUrlQueue();
while(!urls.isEmpty())
{
String url = urls.deQueue();
// String urlName = ulrNames.deQueue();
// FileUtils.writeStringToFile(fileName, urlName+"\r\n", true);
FileUtils.writeStringToFile(file, url+"\r\n", true);
}
} catch (ParserException e) {
e.printStackTrace();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
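Both classes in this post depend on helper classes that the post never lists: Queue and LinkQueue in com.shangkang.yjw.manager and Constant in com.shangkang.yjw.util. The sketch below is my own reconstruction, inferred only from how the crawler calls them; the method bodies, field choices, and constant values (three separate source files) are assumptions, not the author's original code.

// --- com/shangkang/yjw/util/Constant.java (assumed values, adjust to your environment) ---
package com.shangkang.yjw.util;
public class Constant {
    public static final String BASE_URL_XYWY = "http://club.xywy.com"; // assumed site base URL
    public static final String SAVE_FILE_DIR = "d:/crawler-cust/";     // assumed output directory
    public static final String WWWXYWYCOM    = "club.xywy.com";        // assumed site identifier used in file names
}

// --- com/shangkang/yjw/manager/Queue.java (minimal FIFO wrapper) ---
package com.shangkang.yjw.manager;
import java.util.LinkedList;
public class Queue<T> {
    private final LinkedList<T> list = new LinkedList<T>();
    public void enQueue(T t)     { list.addLast(t); }           // append to the tail
    public T deQueue()           { return list.removeFirst(); } // remove from the head
    public boolean isEmpty()     { return list.isEmpty(); }
    public boolean contains(T t) { return list.contains(t); }
}

// --- com/shangkang/yjw/manager/LinkQueue.java (static URL bookkeeping) ---
package com.shangkang.yjw.manager;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.io.FileUtils;
public class LinkQueue {
    private static final Queue<String> unvisitedUrls     = new Queue<String>();
    private static final Queue<String> unvisitedUrlNames = new Queue<String>();
    private static final Set<String>   visitedUrls       = new HashSet<String>();
    private static final Set<String>   valuedUrls        = new HashSet<String>();
    public static void addUnvisitedUrl(String url)      { unvisitedUrls.enQueue(url); }
    public static void addUnvisitedUrlName(String name) { unvisitedUrlNames.enQueue(name); }
    public static Queue<String> getUnVisitedUrlQueue()  { return unvisitedUrls; }
    public static void addVisitedUrl(String url)        { visitedUrls.add(url); }
    public static Set<String> getVisitedUrl()           { return visitedUrls; }
    public static void addValuedUrl(String url, String site) { valuedUrls.add(url); }
    public static Set<String> getValuedUrls()           { return valuedUrls; }
    // Append every collected URL to the given file, one per line.
    public static void flushContent2File(Collection<String> urls, String filePath) throws IOException {
        for (String url : urls) {
            FileUtils.writeStringToFile(new File(filePath), url + "\r\n", true);
        }
    }
}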
// Crawl the second-level links: extract the valued links and follow the pagination
/**
* COPYRIGHT (C) 2010 LY. ALL RIGHTS RESERVED.
*
* No part of this publication may be reproduced, stored in a retrieval system,
* or transmitted, in any form or by any means, electronic, mechanical, photocopying,
* recording, or otherwise, without the prior written permission of 3KW.
*
* Created By: zzqiang
* Created On: 2013-6-18
*
* Amendment History:
*
* Amended By Amended On Amendment Description
* ------------ ----------- ---------------------------------------------
*
**/
package com.shangkang.pzf.xywy;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import com.shangkang.yjw.manager.LinkQueue;
import com.shangkang.yjw.util.Constant;
public class GetValuedLink {
public static void main(String[] args) throws IOException
{
List<String> urls = new ArrayList<String>();
//Read the start links collected by GetStartPoint
urls =FileUtils.readLines(new File(Constant.SAVE_FILE_DIR + "xywy_need_links.txt"));
for (String url : urls)
{
String startPoint = url;
System.out.println(startPoint);
LinkQueue.addUnvisitedUrl(startPoint);
}
while (!LinkQueue.getUnVisitedUrlQueue().isEmpty())
{
String url = LinkQueue.getUnVisitedUrlQueue().deQueue();
System.out.println("---------------------正在处理Url----------------===" + url);
if(!LinkQueue.getVisitedUrl().contains(url))
{
downloadFileAndParserLink(url);
LinkQueue.addVisitedUrl(url);
}
}
String filePath = Constant.SAVE_FILE_DIR + "valued_link_" + Constant.WWWXYWYCOM + ".txt";
LinkQueue.flushContent2File(LinkQueue.getValuedUrls(), filePath);
}
public static void downloadFileAndParserLink(String startPoint)
{
String accessUrl = startPoint;
//http://www.yp900.com/ZY-HXXT/index_2.htm
//http://www.yp900.com/ZY-HXXT/
String urlEnd = startPoint.substring(startPoint.lastIndexOf("/")+1);
int lastPoint = startPoint.lastIndexOf("/");
int lastLastPoint = startPoint.substring(0, lastPoint).lastIndexOf("/");
String sonDir = startPoint.substring(lastLastPoint+1, lastPoint);
startPoint = startPoint.replace(urlEnd, "");
String fileName = urlEnd.equals("") ? sonDir : urlEnd.substring(0, urlEnd.lastIndexOf("."));
HttpClient hc = null;
String filePath = null;
try {
hc = new DefaultHttpClient();
hc.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, 8000);
HttpGet httpGet = new HttpGet(accessUrl);
HttpResponse response = hc.execute(httpGet);
response.getParams();
StatusLine statusLine = response.getStatusLine();
if(statusLine.getStatusCode() == 200)
{
HttpEntity entity = response.getEntity();
// System.out.println(entity.getContentType());
if(entity != null)
{
InputStream is = entity.getContent();
filePath = Constant.SAVE_FILE_DIR+ Constant.WWWXYWYCOM +"/" + sonDir +"/"+fileName + ".htm";
System.out.println("save file Path = " + filePath);
FileUtils.copyInputStreamToFile(is, new File(filePath));
System.out.println("file down load succuss: source url =" + startPoint);
}
}else if(statusLine.getStatusCode() == 404)
{
System.err.println("http 404 :::" + startPoint);;
}
else
{
System.err.println("http connect error");
}
if(null != filePath)
{
parserValuedLinkAndNextLink(filePath,startPoint);
System.out.println("-- 删除下载的文件 --" + filePath);
new File(filePath).delete();
}
} catch (ClientProtocolException e) {
e.printStackTrace();
} catch (IllegalStateException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}finally{
hc.getConnectionManager().shutdown();
}
}
public static void parserValuedLinkAndNextLink(String filePath,String startPoint)
{
// div class="r_btn f_r"
try
{
Parser parser = new Parser(filePath);
NodeList nodeListDiv = parser.extractAllNodesThatMatch(new NodeFilter() {
@Override
public boolean accept(Node node)
{
// System.out.println(node);
if (node.getText().startsWith("td class=\"pl20 w340\""))
{
return true;
} else
{
return false;
}
}
});
NodeList nodeListA = new NodeList();
NodeList nodeListDd = new NodeList();
if(nodeListDiv != null)
{
int size = nodeListDiv.size();
for(int i = 0 ; i < size ; i ++)
{
Node divNode = nodeListDiv.elementAt(i);
NodeList nodes = divNode.getChildren();
nodeListA.add(nodes.extractAllNodesThatMatch(new NodeFilter() {
@Override
public boolean accept(Node node)
{
if(node instanceof LinkTag)
{
return true;
}
else
{
return false;
}
}
}, true));
}
}
System.out.println("-------抽取有价值的连接---start----");
int size = nodeListA.size();
for(int i = 0; i< size ; i++)
{
Node node = nodeListA.elementAt(i);
if(node instanceof LinkTag)
{
String link = ((LinkTag)node).extractLink();
// link = link.replace("file://localhost", "");
// System.out.println(link);
if(link.indexOf("static") != -1)
{
// link = Constant.BASE_URL_XYWY + link;
// link = link.replace("file://localhost", "");
System.out.println("valued link =" + link);
LinkQueue.addValuedUrl(link,Constant.WWWXYWYCOM);
}
}
}
System.out.println("-------抽取有价值的连接---end---");
System.out.println("-------抽取Next下载的连接- start------");
NodeList nextNodeList = new NodeList();
parser = new Parser(filePath);
NodeList pageNumNodeList = parser.extractAllNodesThatMatch(new NodeFilter(){
@Override
public boolean accept(Node node)
{
if(node.getText().startsWith("div class=\"clearfix pageStyle tc mt20 pb20 f12 pagelink\""))
{
return true;
}else
{
return false;
}
}
});
int divSize = pageNumNodeList.size();
String nextLink = null;
for(int i = 0; i< divSize; i++)
{
Node divNode = pageNumNodeList.elementAt(i);
nextNodeList = divNode.getChildren().extractAllNodesThatMatch(new NodeFilter() {
@Override
public boolean accept(Node node)
{
if(node.getText().startsWith("a href=") && node instanceof LinkTag)
{
LinkTag linkTag = (LinkTag)node;
String link = linkTag.extractLink();
String linkText = linkTag.getLinkText();
// System.out.println("linkText =" + linkText);
if(linkText.contains("下一页") && link != null && !link.equals(""))
{
return true;
}
}
return false;
}
}, true);
}
if(null != nextNodeList && nextNodeList.size() > 0)
{
Node node = nextNodeList.elementAt(0);
if(node instanceof LinkTag)
{
LinkTag linkTag = (LinkTag)node;
nextLink = linkTag.extractLink();
System.out.println("nextLink ==" + nextLink);
nextLink = Constant.BASE_URL_XYWY + nextLink;
System.out.println("找到新的下载链接:" + nextLink);
String fileName = nextLink.substring(nextLink.lastIndexOf("/"));
System.out.println("fileName ====" + fileName);
LinkQueue.addUnvisitedUrl(nextLink);
}
}
System.out.println("-------抽取Next下载的连接---end----");
} catch (Exception e)
{
e.printStackTrace();
}
}
}
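One thing worth noting: both download methods set only CoreConnectionPNames.SO_TIMEOUT, which is the read timeout, so a request to a host that never completes the TCP handshake can still block the crawler. Below is a small sketch of a helper that sets both the connect timeout and the read timeout using the same deprecated DefaultHttpClient params API the post already uses; the class name and package are mine, not part of the original project.

package com.shangkang.pzf.xywy;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
// Hypothetical helper, not part of the original code.
public class HttpClientFactory {
    public static HttpClient newClient(int connectTimeoutMs, int readTimeoutMs) {
        DefaultHttpClient hc = new DefaultHttpClient();
        // Maximum time allowed to establish the TCP connection.
        hc.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, connectTimeoutMs);
        // Maximum time to wait for data once connected (the only timeout the original code sets).
        hc.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, readTimeoutMs);
        return hc;
    }
}

downloadFile and downloadFileAndParserLink could then obtain their clients with HttpClientFactory.newClient(5000, 8000) instead of building a DefaultHttpClient inline and setting only SO_TIMEOUT. In HttpClient 4.3 and later the params API is deprecated in favor of RequestConfig, but the sketch keeps the style the post is written against.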