日期:2014-05-18  浏览次数:20630 次

网页抓取 抓取utf-8会乱码怎么解决 谢谢各位 ~!
如题,先谢谢各位了。用下面的代码抓取这个 UTF-8 编码的页面时会出现乱码,但是抓取 GB2312 编码的页面却正常,请问该怎么解决呢?

/**
 * Minimal page fetcher: validates an HTTP URL string, downloads the page it points to and
 * echoes the page text to standard output.
 */
public class test1 {

    /**
     * Downloads the page at {@code pageUrl} and returns its text.
     *
     * <p>The response bytes are decoded explicitly as UTF-8. Without an explicit charset,
     * {@link InputStreamReader} falls back to the platform default encoding, which garbles
     * UTF-8 pages on machines whose default is something else (for example GBK).
     *
     * <p>The page is read line by line and the line terminators are not kept, so the returned
     * text is the concatenation of all lines. The same text is also printed to standard output.
     *
     * @param pageUrl location of the page to download; may be {@code null}
     * @return the page text without line terminators, or {@code null} when {@code pageUrl} is
     *     {@code null} or the download fails
     */
    private String downloadPage(URL pageUrl) {
        if (pageUrl == null) {
            return null;
        }
        // try-with-resources closes the reader (and the underlying network stream) on every path.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                pageUrl.openStream(), java.nio.charset.StandardCharsets.UTF_8))) {
            StringBuilder pageBuffer = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                pageBuffer.append(line);
            }
            String page = pageBuffer.toString();
            System.out.print(page);
            return page;
        } catch (Exception e) {
            // Deliberately best-effort: any failure is reported and signalled with null.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Converts {@code url} to a {@link URL} if it is a well-formed {@code http://} address.
     *
     * @param url candidate address; may be {@code null}
     * @return the parsed URL, or {@code null} when {@code url} is {@code null}, does not start
     *     with {@code http://} (case-insensitive), or cannot be parsed
     */
    private URL verifyUrl(String url) {
        String scheme = "http://";
        // regionMatches(true, ...) gives a case-insensitive prefix test without a lowercase copy.
        if (url == null || !url.regionMatches(true, 0, scheme, 0, scheme.length())) {
            return null;
        }
        try {
            return new URL(url);
        } catch (java.net.MalformedURLException ignored) {
            // An unparsable address is reported to the caller as null.
            return null;
        }
    }

    /**
     * Downloads a fixed sample page and prints it.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        test1 st = new test1();
        String url = "http://j.peopledaily.com.cn/94476/94637/6524482.html";
        URL pageUrl = st.verifyUrl(url);
        if (pageUrl == null) {
            System.err.println("Invalid URL: " + url);
            return;
        }
        st.downloadPage(pageUrl);
    }
}

------解决方案--------------------
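不指定编码时,InputStreamReader 会使用系统默认编码(中文 Windows 上通常是 GBK)来解码,所以 GB2312 页面正常而 UTF-8 页面乱码。创建 InputStreamReader 时显式指定页面的编码即可:
new InputStreamReader(pageUrl.openStream(), "UTF-8")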