java之 25天网络爬虫抓取图片(二)

Luob.

浏览: 1600713 次
来自: 上海

最近访客更多访客>>

Jameslyy

apex53

smith6851

xiaoliefengfeng

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Java

正则表达式网络爬虫抓取图片多线程下载

正则表达式练习


import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.TreeSet;

/**
 * Regular-expression exercises covering the four basic operations:
 * matching, replacing, splitting and extracting.
 */
public class RegexTest {

	/**
	 * Runs the IP sorting exercise; call {@link #test()} or
	 * {@link #checkMail()} here instead to run the other exercises.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		ipSort();
	}

	/**
	 * Exercise: validate an e-mail address and print the result of the match.
	 */
	public static void checkMail(){
		String mail="asdd@sina.com";
		// A stricter alternative is "\\w{6,12}@[a-zA-Z0-9]+(\\.[a-zA-Z]+){1,3}".
		// The loose form below is the one applied; it is adequate when the
		// address is confirmed afterwards by an activation e-mail.
		String regex="\\w+@\\w+(\\.\\w+)+";

		System.out.println(mail.matches(regex));
	}

	/**
	 * Exercise: collapse a stuttered sentence into its clean form.
	 *
	 * Choosing among the four regex operations:
	 * 1. To learn whether a string is valid or not, use matching.
	 * 2. To turn an existing string into another string, use replacing.
	 * 3. To cut a string into several strings, use splitting
	 *    (yields the substrings that lie outside the rule).
	 * 4. To pick out the substrings that satisfy the rule, use extracting.
	 *
	 * Here one string becomes another string, so replacing is used twice.
	 */
	public static void test(){
		String str="我我....我我...我要....要要.....要要...学学学..学学.编程....程程...程";

		// Step 1: remove every run of dots.
		str=str.replaceAll("\\.+", "");
		System.out.println(str);

		// Step 2: reduce each run of a repeated character to a single occurrence.
		str=str.replaceAll("(.)\\1+", "$1");
		System.out.println(str);
	}

	/**
	 * Exercise: sort IP addresses segment by segment.
	 *
	 * Plain string ordering gives the right answer once every segment is
	 * three digits wide, so the addresses are zero-padded first:
	 * 1. Prefix each segment with two zeros so it has at least three digits.
	 * 2. Trim surplus leading zeros so each segment is exactly three digits.
	 *
	 * Prints the two intermediate strings, then the addresses sorted through
	 * a TreeSet (padding removed), then the padded addresses sorted through
	 * Collections.sort.
	 */
	public static void ipSort(){
		String ip="192.68.1.254 102.49.23.013 10.10.10.10 2.2.2.2 8.109.90.30 127.0.0.1";
		ip=ip.replaceAll("(\\d+)", "00$1");
		System.out.println(ip);
		ip=ip.replaceAll("0*(\\d{3})", "$1");
		System.out.println(ip);
		String[] arr=ip.split(" +");

		// Approach 1: Arrays.sort(arr) would sort the array in place.

		// Approach 2: a TreeSet keeps its elements in natural order.
		// Note that a set also drops duplicate addresses.
		TreeSet<String> ts=new TreeSet<String>(Arrays.asList(arr));
		for(String s:ts){
			// Strip the padding again; "000" must still print as "0".
			System.out.println(s.replaceAll("0*(\\d+)", "$1"));
		}

		// Approach 3: sort a list view of the array. The sort has to happen
		// before printing, otherwise the unsorted order is shown.
		List<String> list=Arrays.asList(arr);
		Collections.sort(list);
		for(String s:list){
			System.out.println(s);
		}
	}

}

网络爬虫:(抓取来伊份网站首页的图片,然后下载保存到本地)
抓取本地文件中的所有邮箱


import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Web crawler helper: downloads a single picture and stores it in a local
 * directory.
 *
 * Implemented as a Runnable so that each picture can be fetched on its own
 * thread. Failures are reported on standard error and never propagate out of
 * {@link #run()}.
 */
class downloadPic implements Runnable{
	private String picpath;
	private String path;

	/**
	 * @param picpath URL of the picture to download
	 * @param dir     directory the picture is saved into; created on demand
	 */
	downloadPic(String picpath,String dir){
		this.picpath=picpath;
		this.path=dir;
	}

	/**
	 * Fetches the picture and writes it to the target directory under the
	 * file name found in the URL path. Both streams are closed even when
	 * the transfer fails.
	 */
	public void run(){
		BufferedInputStream bis=null;
		BufferedOutputStream bos=null;
		try {
			String filename=fileNameOf(picpath);
			if(filename.length()==0){
				System.err.println("No file name in picture URL, skipped: "+picpath);
				return;
			}

			// Spaces are not legal in a URL; encode them before connecting.
			URL url=new URL(picpath.replaceAll(" ", "%20"));
			URLConnection conn=url.openConnection();

			File dir=new File(path);
			if(!dir.exists()){
				dir.mkdirs();
			}

			bis=new BufferedInputStream(conn.getInputStream());
			bos=new BufferedOutputStream(new FileOutputStream(new File(dir,filename)));

			byte[] buf=new byte[1024];
			int len;
			while((len=bis.read(buf))!=-1){
				bos.write(buf, 0, len);
			}
			// Flush once at the end; flushing per chunk would defeat the buffer.
			bos.flush();
		} catch (Exception e) {
			// Best effort: one failed picture must not disturb the other downloads.
			e.printStackTrace();
		} finally {
			closeQuietly(bos);
			closeQuietly(bis);
		}
	}

	/**
	 * Extracts the file name from a picture URL: the text after the last '/'
	 * of the path, with any query string or fragment left out. Slashes that
	 * appear inside the query string are ignored.
	 *
	 * @param picUrl picture URL
	 * @return the file name, or an empty string when the path ends with '/'
	 */
	private static String fileNameOf(String picUrl){
		int end=picUrl.length();
		int query=picUrl.indexOf('?');
		if(query!=-1){
			end=query;
		}
		int fragment=picUrl.indexOf('#');
		if(fragment!=-1 && fragment<end){
			end=fragment;
		}
		int slash=picUrl.lastIndexOf('/', end-1);
		return picUrl.substring(slash+1, end);
	}

	/** Closes a stream if it was opened, ignoring a failure of the close itself. */
	private static void closeQuietly(java.io.Closeable stream){
		if(stream==null){
			return;
		}
		try {
			stream.close();
		} catch (java.io.IOException ignored) {
			// Nothing useful can be done when closing fails.
		}
	}
}

/**
 * 抓取 图片 
 * 抓取 本地文件中 所有 邮箱
 * @author Bin
 *
 */
public class RegexTest2 {

	/**
	 * @param args
	 * @throws IOException 
	 */
	public static void main(String[] args) throws IOException {
		//getMails();
		getMails_1();
	}

	public static void cashPicAndMail() throws IOException{
		
		URL url=new URL("http://www.laiyifen.com/");
		
		URLConnection conn= url.openConnection();
		BufferedReader bufr=new BufferedReader(new InputStreamReader(conn.getInputStream(),"utf-8"));
		
		String line=null;
		String mailReg="\\w+@\\w+(\\.\\w+)+";  //抓取网站中的邮箱正则
		mailReg="<img .*? ?src=\"(http:.*?)\".*?( /)?>"; //抓取图片正则
		
		//http://images3.laiyifen.com/laiyifen/2012/10211/10211_01_s.jpg?1332472469#h
		
		//<img src="http://images.laiyifen.com/themes/laiyifen2/images/wl110.jpg">
		//<img border="0" src="http://images.laiyifen.com/themes/laiyifen2/images/tribe_image01.jpg">
		//<img style="width:80px; height:80px;overflow:hidden;" src="http://images4.laiyifen.com/laiyifen/2012/10211/10211_01_s.jpg?1332472469#h" />
		
		Pattern p=Pattern.compile(mailReg);
		
		int count=0;
		String filedir="c:\\laiyifeng\\";
		while((line=bufr.readLine())!=null){
			//System.out.println(line);
			Matcher m=p.matcher(line);
			while(m.find()){
				String picurl=m.group(1);
				new Thread(new downloadPic(picurl,filedir)).start();
				count++;
			}
		}
		System.out.println("总计:"+count);
		
	}
	
	
	
	/**
	 * 获取指定文档中的邮件地址
	 * 使用个获取功能,Pattern  Matcher
	 * @throws IOException 
	 */
	public static void getMails() throws IOException{
		BufferedReader bufr=new BufferedReader(new FileReader("E:\\mail.txt"));
		String line=null;
		String mailReg="\\w+@\\w+(\\.\\w+)+";
		Pattern p=Pattern.compile(mailReg);
		
		while((line=bufr.readLine())!=null){
			Matcher m=p.matcher(line);
			while(m.find()){
				System.out.println(m.group());
			}
		}
	}
}

0
顶

3
踩

分享到：

PS 入门笔记 | java之 25天正则表达式(一)

2013-05-30 13:59
浏览 7083
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

java之 25天网络爬虫抓取图片(二)

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

java之 25天 网络爬虫抓取图片(二)

评论

发表评论

相关推荐

检测一个字符串是否在jvm的常量池中

UTC时间, GMT时间 ,夏令时

java 反射List

JDK1.5 Exchange 两个线程互换数据

JDK1.5 CountDownLatch

java CyclicBarrier 循环障碍阻塞

java 信号灯 Semaphore

java 使用读写锁设计一个缓存模型

java 读写锁

java 多个线程之间同步通信

jdk1.5 锁 Lock 和 Condition

JDK1.5 获取线程执行结果 Callable Future

JDK1.5 线程池

java 多线程ThreadLocal

java 定时器 Timer

java 多线程同步+通信

java 线程同步

java多线程练习

java 传统多线程

java 图片,剪切,缩放

最近访客更多访客>>

java之 25天网络爬虫抓取图片(二)