爬虫 - - ITeye博客

wx1569110409

浏览: 20049 次

最近访客更多访客>>

caokuang_0

jzhfmm

zhangly2011

博主相关

博客

微博

相册

留言

关于我

文章分类

全部博客 (96)

社区版块

存档分类

2019-09 ( 96)
更多存档...

爬虫

1. [代码]主程序

public class Demo {

@SuppressWarnings ( "static-access" )

public static void main(String[] args) {

MyCrawler crawler = MyCrawler.getInstance();

crawler.setUrl( "http://docs.oracle.com/javase/8/docs/api/" );

crawler.setDir( "/api2" );

crawler.setDeep( 3 );

crawler.setThread( 1 );

crawler.start();

}

2. [代码]数据参数处理

public class MyCrawler {

private static String url;

private static int deep = 4 ;

private static int topN = 10 ;

private static int thread = 3 ;

private static String host;

private static String dir = System.getProperty( "user.dir" );

private static MyCrawler crawler = new MyCrawler();

public static MyCrawler getInstance(){

return crawler;

}

private MyCrawler(){}

public static int getDeep() {

return deep;

}

public static void setDeep( int deep) {

MyCrawler.deep = deep;

}

public static int getTopN() {

return topN;

}

public static void setTopN( int topN) {

MyCrawler.topN = topN;

}

public static String getUrl() {

return url;

}

public static void setUrl(String url) {

MyCrawler.url = url;

if (url.endsWith( ".html" )){

host = url.substring( 0 , url.lastIndexOf( "/" ));

} else {

MyCrawler.host = url;

}

public static String getHost() {

return host;

}

public static String getDir() {

return dir;

}

public void start() {

UrlObject obj = new UrlObject(url);

obj.setIdeep( 1 );

QueryCrawler.push(obj);

CrawlerWriterFiles writer = new CrawlerWriterFiles();

writer.open();

}

public static void setDir(String dir) {

MyCrawler.dir += dir+ "\\" ;

}

public static int getThread() {

return MyCrawler.thread;

}

public static void setThread( int thread) {

MyCrawler.thread = thread;

}

3. [代码]url对象

/**
 * A URL paired with an integer depth value.
 */
public class UrlObject {

    private String url;

    /** Depth value associated with the URL; 0 unless set. */
    private int ideep;

    /**
     * Creates an entry for {@code url} with the default depth of 0.
     *
     * @param url the URL
     */
    public UrlObject(String url) {
        this.url = url;
    }

    /**
     * Creates an entry for {@code url} at the given depth.
     *
     * @param url   the URL
     * @param ideep the depth value
     */
    public UrlObject(String url, int ideep) {
        this.url = url;
        this.ideep = ideep;
    }

    /**
     * @return the URL
     */
    public String getUrl() {
        return url;
    }

    /**
     * @param url the URL
     */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * @return the depth value
     */
    public int getIdeep() {
        return ideep;
    }

    /**
     * @param ideep the depth value
     */
    public void setIdeep(int ideep) {
        this.ideep = ideep;
    }
}

4. [代码]url任务队列

public class QueryCrawler {

private static QueryCrawler query = new QueryCrawler();

private static ArrayList<UrlObject> list = new ArrayList<UrlObject>();

private QueryCrawler(){}

public static QueryCrawler getInstance() {

return query;

}

public synchronized static void push(UrlObject obj) {

list.add(obj);

}

public synchronized static void push(List<UrlObject> objs) {

list.addAll(objs);

}

public synchronized static UrlObject pop() {

if (list.size() < 1 )

return null ;

return list.remove( 0 );

}

5. [代码]线程遍历抓取，存储

public class CrawlerWriterFiles {

public void open() {

for ( int i = 0 ; i < MyCrawler.getThread(); i++) {

new Thread( new Runnable() {

public void run() {

while ( true ){

try {

DefaultHttpClient client = new SystemDefaultHttpClient();

final UrlObject obj = QueryCrawler.pop();

if (obj != null ){

HttpPost httpPost = new HttpPost(obj.getUrl());

HttpResponse response = client.execute(httpPost);

final String result = EntityUtils.toString(response.getEntity(), "UTF-8" );

if (obj.getIdeep() < MyCrawler.getDeep() && !obj.getUrl().endsWith( ".css" )){

CrawlerUtil.addUrlObject(obj, result);

}

new Thread( new Runnable() {

public void run() {

try {

CrawlerUtil.writer(obj.getUrl(), result);

} catch (IOException e) {

System.err.println( "输出错误url:" +obj.getUrl());

}

}).start();

} else {

System.out.println( "--------暂时没有任务！！" );

Thread.sleep( 5000 );

}

} catch (Exception e) {

e.printStackTrace();

System.err.println( "error" );

}

}).start();

}

6. [代码]抓取url,存储页面数据

100

101

102

103

104

105

106

107

108

109

public class CrawlerUtil {

private static List<String> arrays = new ArrayList<String>();

private static List<String> filearrays = new ArrayList<String>();

static {

String a = ",[]'\"+:;{}" ;

String[] as = a.split( "" );

for ( int i = 0 ; i < as.length; i++) {

if (as[i].equals( "" )){

continue ;

}

arrays.add(as[i]);

}

filearrays.add( "?" );

filearrays.add( "=" );

//filearrays.add(".");

}

public static void writer(String url, String data) throws IOException {

File file = null ;

if (url.toLowerCase().endsWith( ".css" )){

file = new File(getPathCSS(url));

} else {

file = new File(getPathHTML(url));

}

System.out.println(file.getPath());

if (!file.getParentFile().exists()){

file.getParentFile().mkdirs();

}

if (!file.exists()){

byte [] datab = data.getBytes();

FileOutputStream f = new FileOutputStream(file);

f.write(datab, 0 , datab.length);

f.close();

}

private static String getPathHTML(String url) {

if (url.equals(MyCrawler.getHost())){

url += "index" ;

}

if (!url.endsWith( "html" )){

if (url.endsWith( "/" )){

url+= "index.html" ;

} else if (url.lastIndexOf( "/" ) < url.lastIndexOf( "." )) {

url = url.substring( 0 , url.lastIndexOf( "." )) + ".html" ;

} else {

url += ".html" ;

}

if (url.startsWith( "http://" )){

url = MyCrawler.getDir() + url.replace(MyCrawler.getHost(), "" );

}

for ( int i = 0 ; i < filearrays.size(); i++) {

url = url.replaceAll( "\\" +filearrays.get(i)+ "" , "_" );

}

return url;

}

private static String getPathCSS(String url) {

if (url.startsWith( "http://" )){

url = MyCrawler.getDir() + url.replace(MyCrawler.getHost(), "" );

}

return url;

}

public static void addUrlObject(UrlObject obj, String result) {

//"<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]"

Pattern pcss =Pattern.compile( "<link.*href\\s*=\\s*\"?(.*?)[\"|>]" ,Pattern.CASE_INSENSITIVE);

addUrlObjToPattern(pcss, obj, result);

Pattern pa =Pattern.compile( "<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]" ,Pattern.CASE_INSENSITIVE);

addUrlObjToPattern(pa, obj, result);

Pattern pframe =Pattern.compile( "<frame\\s+src\\s*=\\s*\"?(.*?)[\"|>]" ,Pattern.CASE_INSENSITIVE);

addUrlObjToPattern(pframe, obj, result);

}

private static void addUrlObjToPattern(Pattern p, UrlObject obj,

String result) {

Matcher m = p.matcher(result);

ArrayList<UrlObject> urlobjs = new ArrayList<UrlObject>();

while (m.find()){

String link = m.group( 1 ).trim();

//urlobjs.add(new UrlObject(link, 1+obj.getIdeep()));

if (!isLink(link)){

continue ;

}

if (link.startsWith(MyCrawler.getHost())){

urlobjs.add( new UrlObject(link, 1 +obj.getIdeep()));

} else if (!link.contains( "://" )){

urlobjs.add( new UrlObject(MyCrawler.getHost() + link, 1 +obj.getIdeep()));

}

QueryCrawler.push(urlobjs);

show(urlobjs);

}

private static void show(ArrayList<UrlObject> urlobjs) {

/*for (int i = 0; i < urlobjs.size(); i++) {

System.out.println(urlobjs.get(i).getUrl());

}*/

}

private static boolean isLink(String link) {

if ( null == link) return false ;

link = link.replace(MyCrawler.getHost(), "" );

for ( int i = 0 ; i < arrays.size(); i++) {

if (link.contains(arrays.get(i))){

return false ;

}

return true ;

}

7. [图片] 官网.png

8. [图片] 自己抓取得.png

转载于:https://my.oschina.net/Denniswang/blog/661926

分享到：

Kubernetes 存储资源 PV、PVC 和 Storage ... | Qt第三方圆形进度条-及其改进

2019-09-22 08:00
浏览 192
评论(0)
分类:互联网
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

爬虫

1. [代码]主程序

2. [代码]数据参数处理

3. [代码]url对象

4. [代码]url任务队列

5. [代码]线程遍历抓取，存储

6. [代码]抓取url,存储页面数据

7. [图片] 官网.png

8. [图片] 自己抓取得.png

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

爬虫

1. [代码]主程序

2. [代码]数据参数处理

3. [代码]url对象

4. [代码]url任务队列

5. [代码]线程遍历抓取，存储

6. [代码]抓取url,存储页面数据

7. [图片] 官网.png

8. [图片] 自己抓取得.png

评论

发表评论

相关推荐

最近访客更多访客>>