日期:2014-05-19  浏览次数:20970 次

求C#提取网页正文内容代码
哪位大虾有C#提取网页正文内容的代码,可不可以发上来我参考参考。谢谢啦!!

------解决方案--------------------
/// <summary>
/// Downloads the page at <paramref name="url"/> via <c>ReadHttp</c> and writes it to
/// <paramref name="filename"/> as GB2312 text. If the target file already exists it is
/// first copied to <c>filename + ".bak"</c> (overwriting any previous backup).
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="filename">Path of the file to (over)write.</param>
/// <returns>0 when the page was downloaded and saved; -1 otherwise.</returns>
/// <remarks>
/// Failures are not thrown to the caller. A message is written to the current
/// ASP.NET response when one is available, and -1 is returned.
/// </remarks>
public static int saveHtmlFile(string url,string filename)
{
    int status = -1;
    string respHTML = string.Empty;
    try
    {
        // HttpStatusCode.OK.ToString() yields "OK" (no trailing space). Trim so the
        // comparison also tolerates a padded status string.
        string httpStatus = ReadHttp(url, ref respHTML);
        if (httpStatus != null && httpStatus.Trim() == "OK")
        {
            if (File.Exists(filename))
            {
                // Keep one backup generation next to the original file.
                File.Copy(filename, filename + ".bak", true);
            }

            // "GB2312" must be spelled exactly; a padded name is not a registered encoding.
            using (StreamWriter sw = new StreamWriter(filename, false, Encoding.GetEncoding("GB2312")))
            {
                sw.WriteLine(respHTML);
            }
            status = 0;
        }
        else
        {
            // HttpContext.Current is null outside of a request; skip reporting then.
            System.Web.HttpContext context = System.Web.HttpContext.Current;
            if (context != null)
            {
                context.Response.Write( "找不到该页或服务器错误 ");
            }
        }
    }
    catch (Exception err)
    {
        // Best-effort reporting: never let the error report itself throw.
        System.Web.HttpContext context = System.Web.HttpContext.Current;
        if (context != null)
        {
            context.Response.Write(err.Message);
        }
        status = -1;
    }
    return status;
}

/// <summary>
/// Issues an HTTP request for <paramref name="url"/> and reads the whole response body,
/// decoding it as GB2312.
/// </summary>
/// <param name="url">Address to request; must be an HTTP(S) URL because the request is cast to <see cref="HttpWebRequest"/>.</param>
/// <param name="content">
/// Receives the decoded response body. Left unchanged when the request or the read fails.
/// </param>
/// <returns>
/// The response's <see cref="HttpStatusCode"/> name (e.g. "OK") when the body was read
/// completely; "ERROR" when the request or the read failed.
/// </returns>
/// <remarks>
/// Exceptions raised while requesting or reading are not propagated. Exceptions from
/// <see cref="WebRequest.Create(string)"/> (e.g. a malformed URL) still are.
/// </remarks>
public static string ReadHttp(string url,ref string content)
{
    string status = "ERROR";
    HttpWebRequest Webreq = (HttpWebRequest) WebRequest.Create(url);
    try
    {
        // Dispose in reverse order of acquisition: reader, stream, then response.
        using (HttpWebResponse Webresp = (HttpWebResponse) Webreq.GetResponse())
        using (Stream body = Webresp.GetResponseStream())
        using (StreamReader strm = new StreamReader(body, Encoding.GetEncoding("GB2312")))
        {
            content = strm.ReadToEnd();
            // Report success only after the body has been read, so a caller never
            // sees "OK" together with missing content.
            status = Webresp.StatusCode.ToString();
        }
    }
    catch (WebException ex)
    {
        // Non-success HTTP statuses surface here; release the attached response.
        if (ex.Response != null)
        {
            ex.Response.Close();
        }
    }
    catch (Exception)
    {
        // Deliberately best-effort: any other failure is reported through the
        // "ERROR" return value rather than thrown.
    }
    return status;
}