1、主要通过 getContentType() 读取响应头 Content-Type,从中获取网页的编码方式:
pageUrl=new URL(urlString);
HttpURLConnection uc = (HttpURLConnection) pageUrl.openConnection();
String encoding=uc.getContentType();
2、再从 Content-Type 中提取 charset 子串(这里直接查找 "charset=";由于 HTTP 头中的参数名大小写不敏感,而且响应头可能根本不带 charset,所以最好改用忽略大小写的匹配(例如正则表达式),并在找不到时使用默认编码)
encoding=encoding.substring(encoding.indexOf("charset=")+8).trim();
//System.out.println("+"+encoding+"+");
// 创建网络流
BufferedReader reader=
new BufferedReader(new InputStreamReader(pageUrl.openStream(),encoding));
3、下面是完整的源代码,注释得很清楚:
1 import java.io.BufferedReader;
2 import java.io.IOException;
3 import java.io.InputStreamReader;
4 import java.io.UnsupportedEncodingException;
5 import java.net.HttpURLConnection;
6 import java.net.MalformedURLException;
7 import java.net.URL;
9 public class PageString {
10 private StringBuffer strBuf=new StringBuffer();
11 private URL pageUrl=null;
12 public PageString(String urlString){
13 try {
14 //System.out.println(urlString);
15 pageUrl=new URL(urlString);
16 try {
17 //获取网页的编码方式,这里可以解决乱码问题
18 HttpURLConnection uc = (HttpURLConnection) pageUrl.openConnection();
19 String encoding=uc.getContentType();
20 encoding=encoding.substring(encoding.indexOf("charset=")+8).trim();
21 //System.out.println("+"+encoding+"+");
22 // 创建网络流
23 BufferedReader reader=
24 new BufferedReader(new InputStreamReader(pageUrl.openStream(),encoding));
25 String line;
26 // 读取网页内容
27 //new StringBuffer();
28 while((line=reader.readLine())!=null){
29 //System.out.println(line);
30 strBuf.append(line+"\t\n");
31 }
32 } catch (IOException e) {
33 // TODO Auto-generated catch block
34 e.printStackTrace();
35 }
36 } catch (MalformedURLException e) {
37 // TODO Auto-generated catch block
38 e.printStackTrace();
39 }
40 }
41 public StringBuffer getStrBuf() throws UnsupportedEncodingException {
42 //System.out.println(new String(strBuf.toString().getBytes("gb2312")).toString());
43 return this.strBuf;
44 }