日期:2014-05-20  浏览次数:20842 次

J2SE 抓取网页上的图片
[align=left]
package com.lee.test;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads the images referenced by {@code <img>} tags of a web page (or of a
 * numbered series of pages) into a local folder.
 *
 * <p>All state is static and unsynchronized; the class is meant to be driven from a
 * single thread, as {@link #main(String[])} does.
 *
 * <p>Original author's note: there was a lot more I wanted to polish here; what is
 * left seems to be mostly a matter of putting in the time.
 *
 * @author Lee
 */
public class GetImagesFromWeb {

	// Intended minimum image size (original note: "at least 1k").
	// Not consulted by any code in this class yet.
	private static long size = 1 ;
	// Folder the downloaded images are written to.
	private static File folder = null ;
	// Known image file extensions. Populated in the static initializer but
	// not consulted by any code in this class yet.
	private static List<String> exts= new ArrayList<String>() ;
	// Whether images are fetched from a single page.
	// Not consulted by any code in this class yet.
	private static boolean isSingle = true ;
	// Running number that makes each saved file name unique within one JVM run.
	private static long counter = 0 ;
	// Text placed in front of the running number in every saved file name
	// (despite the identifier, it is used as a prefix).
	private static String fileNameSuffix = "default_filename_suffix_" ;

	// Upper bounds so that one stalled server cannot hang a whole batch run.
	private static final int CONNECT_TIMEOUT_MILLIS = 10000 ;
	private static final int READ_TIMEOUT_MILLIS = 30000 ;

	// Matches the src attribute of an <img> tag. The value may be double-quoted
	// (group 1), single-quoted (group 2) or unquoted (group 3). Requiring
	// whitespace before "src" keeps attributes such as data-src from matching.
	private static final Pattern IMG_SRC = Pattern.compile(
			"<img\\b[^>]*?\\ssrc\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))",
			Pattern.CASE_INSENSITIVE) ;

	static{
		String path = "C:\\Documents and Settings\\Administrator\\桌面\\MyImagesFolder" ;
		folder = new File(path) ;
		if(!folder.exists()){
			// mkdirs() also creates missing parent directories. A failure is not
			// reported here; it surfaces later when an output file cannot be opened.
			folder.mkdirs() ;
		}
		exts.add("jpeg");
		exts.add("jpg") ;
		exts.add("gif") ;
	}

	/**
	 * Returns the file extension of the last path segment of {@code imageUrl}.
	 * Query string and fragment are not part of {@link URL#getPath()} and are
	 * therefore ignored.
	 *
	 * @param imageUrl the image location
	 * @return the extension without the dot, or an empty string when the last path
	 *         segment has no dot or the text after the dot is not purely alphanumeric
	 */
	private static String getExtName(URL imageUrl){
		String path = imageUrl.getPath() ;
		String name = path.substring(path.lastIndexOf('/') + 1) ;
		int dot = name.lastIndexOf('.') ;
		if(dot < 0){
			return "" ;
		}
		String ext = name.substring(dot + 1) ;
		for(int i = 0 ; i < ext.length() ; i++){
			if(!Character.isLetterOrDigit(ext.charAt(i))){
				return "" ;
			}
		}
		return ext ;
	}

	/**
	 * Opens a connection to {@code target} with connect and read timeouts applied.
	 *
	 * @param target the location to connect to
	 * @return the configured, not yet connected, connection
	 * @throws IOException if the connection object cannot be created
	 */
	private static URLConnection openConnection(URL target) throws IOException {
		URLConnection connection = target.openConnection() ;
		connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS) ;
		connection.setReadTimeout(READ_TIMEOUT_MILLIS) ;
		return connection ;
	}

	/** Closes {@code resource} if it is non-null, ignoring any failure to close. */
	private static void closeQuietly(Closeable resource){
		if(resource == null){
			return ;
		}
		try {
			resource.close() ;
		} catch (IOException ignored) {
			// Nothing useful can be done about a failed close here.
		}
	}

	/**
	 * Collects the raw {@code src} values of all {@code <img>} tags in {@code html},
	 * in document order. Values are trimmed, {@code &amp;} is decoded to {@code &},
	 * and empty values are skipped.
	 */
	private static List<String> extractImageSources(CharSequence html){
		List<String> sources = new ArrayList<String>() ;
		Matcher matcher = IMG_SRC.matcher(html) ;
		while(matcher.find()){
			String src = matcher.group(1) ;
			if(src == null){
				src = matcher.group(2) ;
			}
			if(src == null){
				src = matcher.group(3) ;
			}
			src = src.trim().replace("&amp;", "&") ;
			if(src.length() > 0){
				sources.add(src) ;
			}
		}
		return sources ;
	}

	/**
	 * Fetches the page at {@code url} and returns the absolute locations of the
	 * images it references.
	 *
	 * <p>Every failure is reported on standard output. The method never returns
	 * {@code null}: on failure the list is empty, or holds what could be extracted
	 * from the part of the page read before the error.
	 *
	 * @param url address of the page to scan
	 * @return the resolved image URLs, possibly empty
	 */
	private static List<URL> getImageUrls(String url){
		List<URL> urls = new ArrayList<URL>() ;
		URL page ;
		try {
			page = new URL(url) ;
		} catch (MalformedURLException e) {
			System.out.println(url+"  不合法!");
			return urls ;
		}
		URLConnection connection ;
		try {
			connection = openConnection(page) ;
		} catch (IOException e) {
			System.out.println("网络连接错误!");
			return urls ;
		}
		InputStream in ;
		try {
			in = connection.getInputStream() ;
		} catch (IOException e) {
			System.out.println("IO设备错误");
			return urls ;
		}
		// The whole document is buffered so that tags spanning several lines are found.
		StringBuilder html = new StringBuilder() ;
		BufferedReader br = null ;
		try {
			// Decode with a fixed charset so results do not depend on the platform
			// default; URLs are ASCII in practice, so the page's own charset is
			// not significant for extracting them.
			br = new BufferedReader(new InputStreamReader(in, "UTF-8")) ;
			String line ;
			while((line = br.readLine()) != null){
				html.append(line).append('\n') ;
			}
		} catch (IOException e) {
			// Keep going with whatever part of the page was read.
			System.out.println("流读写错误");
		} finally {
			closeQuietly(br) ;
			closeQuietly(in) ;
		}
		// Resolve relative references against the address the page was actually
		// served from (this differs from 'page' after an HTTP redirect).
		URL base = connection.getURL() ;
		for(String src : extractImageSources(html)){
			try {
				urls.add(new URL(base, src)) ;
			} catch (MalformedURLException e) {
				System.out.println(src+ "不合法!");
			}
		}
		return urls ;
	}

	/**
	 * Downloads {@code imageUrl} into the image folder under the next numbered file
	 * name. Success or failure is reported on standard output; a file that was
	 * opened but could not be written completely is deleted again.
	 */
	private static void saveImage(URL imageUrl){
		InputStream is ;
		try {
			is = openConnection(imageUrl).getInputStream() ;
		} catch (IOException e) {
			System.out.println("IO 错误!");
			return ;
		}
		String ext = getExtName(imageUrl) ;
		String fileName = fileNameSuffix + (counter++) + (ext.length() == 0 ? "" : "." + ext) ;
		File file = new File(folder, fileName) ;
		FileOutputStream fos = null ;
		boolean saved = false ;
		try {
			fos = new FileOutputStream(file) ;
			byte[] buffer = new byte[8192] ;
			int len ;
			while((len = is.read(buffer)) != -1){
				fos.write(buffer, 0, len) ;
			}
			fos.flush() ;
			saved = true ;
		} catch (FileNotFoundException e) {
			System.out.println("文件 "+file.getAbsolutePath()+"不存在!");
		} catch (IOException e) {
			System.out.println("IO错误!");
		} finally {
			closeQuietly(fos) ;
			closeQuietly(is) ;
		}
		if(saved){
			System.out.println(file.getName()+" 获取成功!");
		} else if(fos != null){
			// Only remove a file this call opened itself, so a truncated image
			// is not left behind.
			file.delete() ;
		}
	}

	/**
	 * Downloads every image referenced by the page at {@code url}.
	 *
	 * <p>Problems (malformed address, unreachable page, failed download) are
	 * reported on standard output and the affected page or image is skipped; no
	 * exception is thrown for them.
	 *
	 * @param url address of the page to scan
	 */
	public static void getImagesFromSinglePage(String url) {
		for(URL imageUrl : getImageUrls(url)){
			saveImage(imageUrl) ;
		}
	}

	/**
	 * Downloads the images of a numbered series of pages whose addresses are
	 * {@code urlFirst + i + urlLast} for every {@code i} from {@code beginIndex}
	 * to {@code endIndex}, both inclusive.
	 *
	 * @param urlFirst   part of the address before the page number
	 * @param urlLast    part of the address after the page number
	 * @param beginIndex first page number
	 * @param endIndex   last page number
	 */
	public static void batchGetImages(String urlFirst,String urlLast,int beginIndex , int endIndex){
		for(int i = beginIndex ; i <= endIndex ;  i++ ){
			getImagesFromSinglePage(urlFirst+i+urlLast) ;
		}
	}

	public static void main(String[] args) {
		// Fetch all images of a single page:
		//getImagesFromSinglePage("http://www.qiushibaike.com/new2/pic/20/page/6/") ;

		// Fetch the images of all pages whose address follows a numeric pattern.
		// Take http://www.qiushibaike.com/new2/pic/20/page/350/ as an example:
		// the number 350 stands for page 350, and 1 would be the first page.
		// The call below fetches pages 1 through 30,
		// which yields roughly 600 images.
		batchGetImages("http://www.qiushibaike.com/new2/pic/20/page/","/", 1, 30) ;
	}

}

[/align]