日期:2014-05-20  浏览次数:20493 次

如何解析html网页?
请问哪位高手知道如何解析网页中自己想要的部分内容?我现在已经抓取到了网页,但是不知道如何提取自己想要的那部分内容。

------解决方案--------------------
这个要用到正则表达式。
给你个例子:

/// <summary>
/// Extracts the text that precedes the <paramref name="i"/>-th closing
/// <c>&lt;/font&gt;</c> tag in <paramref name="tmp"/>, with markup removed.
/// </summary>
/// <param name="tmp">HTML fragment to search. Tag matching is case-insensitive.</param>
/// <param name="i">
/// 1-based ordinal of the closing tag to stop at. For 1 the extracted span starts at the
/// first <c>&lt;font</c>; for larger values it starts right after the previous
/// <c>&lt;/font&gt;</c>.
/// </param>
/// <returns>
/// The extracted span with tags, CR, LF and TAB replaced by a space and every run of
/// spaces / <c>&amp;nbsp;</c> entities collapsed into one <c>&amp;nbsp;</c> marker.
/// An empty string when <paramref name="tmp"/> is null or empty, or when the
/// requested tag does not exist.
/// </returns>
private string getString(string tmp, int i)
{
    // The tag literals must not carry padding spaces: the scan advances by the
    // length of "</font>" (7 characters), which only lines up with the bare tag.
    const string openTag = "<font";
    const string closeTag = "</font>";

    if (string.IsNullOrEmpty(tmp))
    {
        return string.Empty;
    }

    // Search the original text case-insensitively instead of a lower-cased copy,
    // so the returned text keeps its original casing.
    int start = tmp.IndexOf(openTag, System.StringComparison.OrdinalIgnoreCase);
    if (start < 0)
    {
        return string.Empty;
    }

    // Skip over the first (i - 1) closing tags.
    for (int n = 1; n < i; n++)
    {
        int previousClose = tmp.IndexOf(closeTag, start, System.StringComparison.OrdinalIgnoreCase);
        if (previousClose < 0)
        {
            return string.Empty;
        }
        start = previousClose + closeTag.Length;
    }

    // Search through to the end of the string so a closing tag that is the very
    // last thing in the input is still found.
    int end = tmp.IndexOf(closeTag, start, System.StringComparison.OrdinalIgnoreCase);
    if (end < 0)
    {
        return string.Empty;
    }

    string fragment = tmp.Substring(start, end - start);

    // Replace every tag (possibly spanning several lines) and every CR/LF/TAB with a space.
    string withoutTags = System.Text.RegularExpressions.Regex.Replace(
        fragment,
        "<(.|[\f\n\r\t\v])+?>|[\n\r\t]",
        " ",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);

    // Collapse runs of spaces / &nbsp; entities. This has to be an alternation: a
    // character class such as [(&nbsp;)|( )] would also swallow the letters n, b, s, p.
    // NOTE(review): the trailing space in the "&nbsp; " replacement is kept as found;
    // it may be paste padding like the other literals - confirm the intended output.
    return System.Text.RegularExpressions.Regex.Replace(
        withoutTags,
        "(?:&nbsp;| )+",
        "&nbsp; ",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);
}
/// <summary>
/// Issues an HTTP GET for <paramref name="_requestUrl"/> and returns the whole
/// response body decoded with the gb2312 encoding.
/// </summary>
/// <param name="_requestUrl">Absolute URL of the page to download.</param>
/// <returns>The response body as a string.</returns>
private string GetContentFromUrll(string _requestUrl)
{
    HttpWebRequest _WebRequest = (HttpWebRequest)WebRequest.Create(_requestUrl);
    // The verb must be the bare token: a trailing space is not a valid HTTP method.
    _WebRequest.Method = "GET";

    // using blocks release the response and the reader even when reading throws,
    // and dispose the reader before the response it reads from.
    using (WebResponse _WebResponse = _WebRequest.GetResponse())
    using (StreamReader _ResponseStream = new StreamReader(
        _WebResponse.GetResponseStream(),
        System.Text.Encoding.GetEncoding("gb2312")))
    {
        return _ResponseStream.ReadToEnd();
    }
}
//---
// Download the page for the city name held in ct (declared outside this snippet).
// The name is URL-encoded as gb2312 and appended directly after "city=".
string tmp = GetContentFromUrll(
    "http://www.cma.gov.cn/netcenter_news/qxyb/city/index.php?city="
    + System.Web.HttpUtility.UrlEncode(ct, System.Text.Encoding.GetEncoding("gb2312")));

// Keep only the part of the page between the two HTML comment markers. Each marker
// is located once, and a missing or out-of-order marker is reported explicitly
// instead of surfacing as an ArgumentOutOfRangeException from Substring.
int sectionStart = tmp.IndexOf("<!--未来天气预报开始-->", System.StringComparison.Ordinal);
int sectionEnd = tmp.IndexOf("<!--指数预报结束-->", System.StringComparison.Ordinal);
if (sectionStart < 0 || sectionEnd < sectionStart)
{
    throw new System.InvalidOperationException(
        "Forecast section markers were not found in the downloaded page.");
}
tmp = tmp.Substring(sectionStart, sectionEnd - sectionStart);

// Each value joins three consecutive font cells; every fourth cell (4, 8, 12, 16, 20) is skipped.
// NOTE(review): the separator and label literals below are kept exactly as found; their
// padding spaces may be paste artifacts like those in the search literals - confirm.
string dt = getString(tmp, 1) + "| " + getString(tmp, 2) + "| " + getString(tmp, 3);
string wt = getString(tmp, 5) + "| " + getString(tmp, 6) + "| " + getString(tmp, 7);
string wd = getString(tmp, 9) + "| " + getString(tmp, 10) + "| " + getString(tmp, 11);
string wl = getString(tmp, 13) + "| " + getString(tmp, 14) + "| " + getString(tmp, 15);
string wen = getString(tmp, 17) + "| " + getString(tmp, 18) + "| " + getString(tmp, 19);

// The index lines are built only when the section mentions the pollution index.
string wu = " ";
if (tmp.IndexOf("污染指数", System.StringComparison.Ordinal) != -1)
{
    wu = "污染指数: " + getString(tmp, 21) + " <br/> \n "
        + "紫外线指数: " + getString(tmp, 23) + " <br/> \n "
        + "舒适度指数: " + getString(tmp, 25) + " <br/> \n "
        + "穿衣指数: " + getString(tmp, 27);
}