日期:2014-05-20  浏览次数:20746 次

java新闻抓取程序图片下载不全的问题
我做了个程序把新浪上的天气新闻抓过来存到本地,考虑访问速度问题,新闻中的图片也要保存到本地。
程序如下
Java code

package vnet.com.weather1;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import vnet.com.update.Getdata;
/**
 * 正则方式抓取新浪天气新闻上的新闻
 * 地址http://weather.news.sina.com.cn/weather/news/index.html
 * @param args
 */
public class Newlist {
    private static final Log log = LogFactory.getLog(Newlist.class);
    /**
     * 测试
     * @param args
     */
    public  static void main(String args[]){
        Newlist n=new Newlist();
        String[] k=n.getNewList();
        for (int i=0;i<k.length;i++){
        System.out.println(k[i].replace("href=\"", "href=\"newinfo2.jsp?url="));
        }
        String[] m=n.getNewinfo("news/2008/1119/35261.html");
        for (int l=0;l<m.length;l++){        
            System.out.println(m[l]);    
        }
        
    }
    /**
     * 由url地址获得新闻内容string[]
     * 新闻中的图片下载到本地,文中新闻地址改成本地地址
     * @param url
     * @return
     */
    public String[] getNewinfo(String url){
        String URL="http://weather.news.sina.com.cn/"+url;
        //30是指取30段满足给出的正则条件的字符串,如果只找出10个,那数组后面的全为null
        String[] s = analysis("<p>(.*?)</p>" , getContent(URL) , 30);
        for (int i=0;i<s.length;i++){
            Pattern sp = Pattern.compile("src=\"(.*?)\"");
            Matcher matcher = sp.matcher(s[i]);
            if (matcher.find()){
                
                 String imageurl=analysis("src=\"(.*?)\"" , s[i] , 1)[0];
                 if(!imageurl.startsWith("http://")){
                     imageurl="http://weather.news.sina.com.cn/"+imageurl;
                  }
                System.out.println("新闻有图片:"+imageurl);
                String content=getContent(imageurl);
                  String[] images=imageurl.split("/");
                  String imagename=images[images.length-1];
                  System.out.println("图片名:"+imagename);
                  
                 
        try {
            File fwl = new File(imagename); 
            PrintWriter outl = new PrintWriter(fwl);
            outl.println(content);
            outl.close();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            System.out.println("s[i]:"+s[i]);
            //修改文件图片地址
            s[i]=s[i].replace(analysis("src=\"(.*?)\"" , s[i] , 1)[0], imagename);
            }
        }
        
        return s;
    }
    public  String[] getNewList(){
        String url="http://weather.news.sina.com.cn/weather/news/index.html";
        return getNewList(getContent(url));       
    }

    private  String[] getNewList(String content ){
        //String[] s = analysis("align=\"center\" valign=\"top\"><img src=\"../images/a(.*?).gif\" width=\"70\" height=\"65\"></td>" , content , 50);    
        String[] s = analysis("<li>(.*?)</li>" , content , 50);
        
        return s;
    }
    private String[] analysis(String pattern, String match , int i){
        Pattern sp = Pattern.compile(pattern);
        Matcher matcher = sp.matcher(match);
        String[] content = new String[i];
        for (int i1 = 0; matcher.find(); i1++){        
            content[i1] = matcher.group(1);       
        }
        //下面一段是为了剔除为空的串
        int l=0;
        for (int k=0;k<content.length;k++){
            if (content[k]==null){
                l=k;
                break;
            }
        }
        String[] content2;
        if (l!=0){
            content2=new String[l];
            for (int n=0;n<l;n++){
                content2[n]=content[n];
            }
             return content2;
        }else{
            return content;    
        }
       
    }
    /**
     * 由地址获取网页内容
     * @param strUrl
     * @return
    private String getContent(String strUrl){
        try{
            //URL url = new URL(strUrl);    
            //BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream()));
            URLConnection uc = new URL(strUrl).openConnection(); 
               //通过修改http头的User-Agent来伪装成是通过浏览器提交的请求
              uc.setRequestProperty("User-Agent",  
                                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");                  
              System.out.println("-----------------------------------------");  
              System.out.println("Content-Length:     "+uc.getContentLength());  
              System.out.println("Set-Cookie:     "+uc.getHeaderField("Set-Cookie"));  
              System.out.println("-----------------------------------------"); 
              //获取文件头信息
              System.out.println("Header"+uc.getHeaderFields().toString());
              System.out.println("-----------------------------------------");  
            BufferedReader br=new BufferedReader(new InputStreamReader(uc.getInputStream(), "gb2312")); 
            String s = "";
            StringBuffer sb=new StringBuffer();
            while((s = br.readLine())!=null){
                sb.append(s+"\r\n");
            }
            System.out.println("长度+"+sb.toString().length());
            
            return sb.toString();
        }catch(Exception e){
            return "error open url" + strUrl;
        }
    }
    */

    public static  String getContent (String strUrl){
        URLConnection uc = null;
        String all_content=null;

       
    try {
               all_content =new  String();
               URL url = new URL(strUrl);

               uc = url.openConnection();
               uc.setRequestProperty("User-Agent",  
                                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");                  
              System.out.println("-----------------------------------------");  
              System.out.println("Content-Length:     "+uc.getContentLength());  
              System.out.println("Set-Cookie:     "+uc.getHeaderField("Set-Cookie"));  
              System.out.println("-----------------------------------------"); 
              //获取文件头信息
              System.out.println("Header"+uc.getHeaderFields().toString());
              System.out.println("-----------------------------------------");  
               if (uc == null)
                   return null;

               InputStream ins = uc.getInputStream();
                ByteArrayOutputStream outputstream = new ByteArrayOutputStream();
               byte[] str_b = new byte[1024];
                   int i = -1;
                   while ((i=ins.read(str_b)) > 0) {
                    outputstream.write(str_b,0,i);
                   }
                   all_content = outputstream.toString();
                  // System.out.println(all_content);

           } catch (Exception e) {
               e.printStackTrace();
               log.error("获取网页内容出错");
           }finally{
               uc = null;
           }
          
          // return new String(all_content.getBytes("ISO8859-1"));
           System.out.println(all_content.length());
           return all_content;
       }
      
}