爬虫基础类

笨小孩在早起

浏览: 47430 次

最近访客更多访客>>

gundammew

zwj1030711290

sunjunblack

qq849397558

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

技术技巧

linux java 爬虫

自己封装的爬虫基础类。

public interface TaskBaseInfo {
	
	/**
	 * 返回任务的名称.
	 * <br/>
	 * 一般用作日志输出
	 * @return
	 */
	String taskName();

	/**
	 * 返回任务的唯一code
	 * @return 在整个爬虫项目中不重复的Code值
	 */
	String taskCode();
}

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import redis.clients.jedis.JedisCluster;

public interface TaskStringCache {

	Logger logger = LoggerFactory.getLogger(TaskStringCache.class);
	
	String BASE_FILE_PATH = "/mfs/ShareFile/static/cms/crawler/cache/";
	
	JedisCluster obtainJedisCluster();
	
	String getCacheStr(String taskCode, String cacheKey);

	void setCacheStr(String taskCode, String cacheKey, int cacheSeconds, String cacheStr);
	
	
	
	default String obtainTargetFilePath(String taskCode, String cacheKey) {
		
		return BASE_FILE_PATH + taskCode + File.pathSeparator + cacheKey + ".properties";
	}
	
	/**
	 * 设置缓存的默认方法
	 * @param taskName 任务中文名,日志使用
	 * @param taskCode 任务code需保持唯一性
	 * @param cacheKey 缓存的key
	 * @param cacheStr 缓存的值
	 */
    default void defaultSetCacheStr(String taskName, String taskCode, String cacheKey, int cacheSeconds, String cacheStr) {

        JedisCluster jedisCluster = obtainJedisCluster();
        jedisCluster.setex(cacheKey, cacheSeconds, cacheStr);
        String targetFilePath = obtainTargetFilePath(taskCode, cacheKey);
        save2FileAtomic(taskName, targetFilePath, cacheStr);
    }

    /**
     * 获取通过【设置缓存的默认方法】{@link #defaultSetCacheStr(String, String, String, String)}设置的缓存
     * @param taskName 任务中文名,日志使用
     * @param taskCode 任务code需保持唯一性
     * @param cacheKey 缓存的key
     * @return
     */
    default String defaultGetCacheStr(String taskName, String taskCode, String cacheKey) {

        JedisCluster jedisCluster = obtainJedisCluster();
        String cacheStr = jedisCluster.get(cacheKey);
        if (StringUtils.isNotBlank(cacheStr)) {

            return cacheStr;
        }

        String targetFilePath = obtainTargetFilePath(taskCode, cacheKey);
        try {
        	// 没利用到多少异步的优势，执行异步操作后马上获取结果还是会阻塞
            cacheStr = readFile(targetFilePath).get();
        } catch (InterruptedException | ExecutionException e) {
			
        	logger.error("【" + taskName + "】 执行异步获取文件缓存内容时失败. taskCode=>" + "【" + taskCode + "】" + " cacheKey=>" + "【" + cacheKey + "】");
        	logger.error(e.getMessage());
		}

        return cacheStr;
    }

    /**
     * 通过文件持久化爬取的游标Id，避免在数据增加字段
     * 文件写入操作较慢，异步执行
     * 原子操作,避免写入和读取的并发问题
     *
     * @param filePath
     * @return
     */
    default void save2FileAtomic(String taskName, String filePath, String content) {

        CompletableFuture.runAsync(() -> {

			File tmpFile = new File(filePath + ".tmp");
			try {

				if (tmpFile.exists() == false) {

					tmpFile.getParentFile().mkdirs();
					tmpFile.createNewFile();
				}
				try (FileWriter fw = new FileWriter(tmpFile)) {

					fw.write(content);
					fw.flush();
				}

			} catch (IOException e) {

				logger.error("【" + taskName + "】 => 写入缓存字符串到文件 => 【" + tmpFile + "】 时异常	\n" + e.getMessage());
				logger.error("【" + taskName + "】 文件写入操作退出");
				if (tmpFile.exists()) {
					tmpFile.delete();
				}
				return;
			}

			if (tmpFile.exists() == false) {

				return;
			}
            
            // 此段注释针对windows系统在同一个文件系统内且是同一个盘符下已经有一个目标文件；
            // 下面的renameTo操作会失败，造成无限递归调用进而 【栈溢出】 异常
            // 在Linux运行的情况下，可暂时先注释掉，测试没问题后上线
            // 注释开始段
//			File destFile = new File(filePath);
//			if (destFile.exists()) {
//				destFile.delete();
//			}
            // 注释结束段

            if (tmpFile.renameTo(new File(filePath))) {

                tmpFile.delete();
            } else {

            	logger.error("move fails filePath:" + filePath);
                tmpFile.delete();
                this.save2FileAtomic(taskName, filePath, content);
                // 当在Linux某个发行版下测试时，renameTo操作出错的话，可不硬性要求原子操作，
                // 可将上面的原子操作注释掉，改为下面的操作
//				save2File(filePath, content);
            }
        });
    }

//	default void save2File(String filePath, String content) throws IOException {
//		
//		try (FileWriter fw = new FileWriter(new File(filePath))) {
//
//			fw.write(content);
//			fw.flush();
//		}
//	}

    /**
     * 异步读取文件内容
     *
     * @param filePath
     * @return
     * @throws IOException
     * @throws FileNotFoundException
     */
	default CompletableFuture<String> readFile(String filePath) {

		return CompletableFuture.supplyAsync(() -> {

			StringBuilder strb = new StringBuilder();
			try (FileInputStream fis = new FileInputStream(filePath); 
					BufferedReader inReader = new BufferedReader(new InputStreamReader(fis));) {

				String line = StringUtils.EMPTY;
				while ((line = inReader.readLine()) != null) {
					strb.append(line);
				}
			} catch (IOException e) {

				logger.error(e.getMessage());
				return StringUtils.EMPTY;
			}

			return strb.toString();
		});
	}
}


public interface BasicTask {

	void run();
}

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;

import com.xxx.zx.crawler.basic.BasicTask;

public abstract class BaseCrawlerTask implements TaskBaseInfo, TaskStringCache, BasicTask, ApplicationContextAware {

	protected final Logger logger = LoggerFactory.getLogger(getClass());
	
	protected static ApplicationContext ac;
	
	@Override
	public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
		
		ac = applicationContext;
	}
	
	public synchronized static <T> T getBean(Class<T> beanClass) {
		
		return ac.getBean(beanClass);
	}
	
	public synchronized static Object getBean(String beanName) {
		
        return ac.getBean(beanName);
    }  
	
	@Override
	public String getCacheStr(String taskCode, String cacheKey) {
		
		return defaultGetCacheStr(taskName(), taskCode, cacheKey);
	}

	@Override
	public void setCacheStr(String taskCode, String cacheKey, int cacheSeconds, String cacheStr) {
		
		defaultSetCacheStr(taskName(), taskCode, cacheKey, cacheSeconds, cacheStr);
	}
}

1
顶

0
踩

分享到：

方便jedis cluster操作的工具类 | 基于AOP的ajax的referrer判断

2018-03-15 17:28
浏览 825
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

爬虫基础类

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

爬虫基础类

评论

发表评论

相关推荐

简单的压测模拟

Java的驼峰与下划线的属性对象互相转换

Elastic Search搜索实例

针对基于Redis Cluster的接口数据缓存删除实现

简单ELK配合logback搭建日志监控中心

spring的基于java的项目配置示例2

HttpClient实例

spring的基于java的项目配置示例1

基于spring data的Elastic Search的配置示例

方便jedis cluster操作的工具类

基于AOP的ajax的referrer判断

Java Timestamp从MySQL数据库取出的字符串转换为LocalDateTime

reviewC指针

Python2.X内置函数学习

学习Python中遇到的问题

最近访客更多访客>>