高分求一个抓取网页数据的软件
求一个抓取网页数据的软件(需要提取网页源码中产品的图片、价格、描述、重量等信息)
------解决方案--------------------去下载一个小偷程序看看,看完基本就知道怎么写了....
------解决方案--------------------火车头采集器
------解决方案--------------------先获取网页的 HTML 源码,再用正则表达式分析源码并提取所需要的内容。
会写正则表达式就好办了。
------解决方案--------------------顶,也顺便学习下!
------解决方案--------------------
网页抓取类:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Collections;
using System.IO.Compression;
public class webCrawl
{
   /// <summary>
   /// Creates a <c>webCrawl</c> instance. The constructor body is empty.
   /// </summary>
   public webCrawl()
   {
   }
   // Fetch the HTML text of the page at the given url
   /// <summary>
   /// Downloads the resource at <paramref name="url"/> and returns its body as text,
   /// decoded with the encoding name reported by <c>getEncoding</c>.
   /// </summary>
   /// <param name="url">Address of the page to download.</param>
   /// <returns>
   /// The decoded response body. An empty string is returned when the status code is
   /// not 200 OK, when the reported content length is 1 MiB or more, or when any
   /// exception is raised while resolving the encoding or performing the request.
   /// </returns>
   public static string getHtml(string url)
   {
       // Responses whose reported Content-Length reaches this size are not read.
       const long maxContentLength = 1024 * 1024;

       try
       {
           // getEncoding issues its own request to the same url to sniff the charset,
           // so the page is fetched once for the encoding and once for the content.
           Encoding en = Encoding.GetEncoding(getEncoding(url));

           HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
           request.Headers.Set("Pragma", "no-cache");
           request.Timeout = 30000; // milliseconds

           // The using blocks release the response, stream and reader on every path,
           // including the early return below and any exception thrown while reading.
           using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
           {
               if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= maxContentLength)
               {
                   return "";
               }

               using (Stream strM = response.GetResponseStream())
               using (StreamReader sr = new StreamReader(strM, en))
               {
                   return sr.ReadToEnd();
               }
           }
       }
       catch (Exception)
       {
           // Deliberate best-effort contract: callers receive an empty string on any failure.
           return String.Empty;
       }
   }
   // Detect the character encoding of the page
   public static string getEncoding(string url)
   {
       HttpWebRequest request = null;
       HttpWebResponse response = null;
       StreamReader reader = null;
       try
       {
           request = (HttpWebRequest)WebRequest.Create(url);
           request.Timeout = 30000;
           request.AllowAutoRedirect = false;
           response = (HttpWebResponse)request.GetResponse();
           if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < 1024 * 1024)
           {
               if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase))
                   reader = new StreamReader(new GZipStream(response.GetResponseStream(), CompressionMode.Decompress));
               else
                   reader = new StreamReader(response.GetResponseStream(), Encoding.ASCII);
               string html = reader.ReadToEnd();
               Regex reg_charset = new Regex(@"charset\b\s*=\s*(?<charset>[^""]*)");
               if (reg_charset.IsMatch(html))
               {
                   return reg_charset.Match(html).Groups["charset"].Value;
               }
               else if (response.CharacterSet != string.Empty)
               {
                   return response.CharacterSet;
               }
               else
                   return Encoding.Default.BodyName;
           }
       }
       catch (Exception ex)
       {
           throw new Exception(ex.Message);
       }
       finally
       {
           if (response != null)
           {
               response.Close();
               response = null;
           }
           if (reader != null)
               reader.Close();