// Java code that reads HTML text and extracts the e-mail addresses it contains.
// Source: http://blog.csdn.net/javaalpha/article/details/8332587
package com.alpha.test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Reads HTML text (from a local file or a URL) and extracts e-mail addresses from it.
 *
 * <p>All methods are static; the class holds no state.
 *
 * @author JavaAlpha 2012-12-19 13:45:11
 */
public class ReadHtmlToTxt {

    /**
     * Matches one e-mail address: local part, {@code @}, dotted domain ending in an alphabetic
     * top-level label of at least two letters.
     *
     * <p>The look-behind plus the possessive quantifier make a match attempt start only at the
     * beginning of a run of local-part characters, so long runs without an {@code @}
     * (for example embedded base64 data) are scanned once instead of once per start position.
     */
    private static final Pattern EMAIL_PATTERN = Pattern.compile(
            "(?<![A-Za-z0-9._%+-])[A-Za-z0-9._%+-]++@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}");

    /** Matches a single character in the range U+4E00..U+9FA5 (common Chinese ideographs). */
    private static final Pattern CHINESE_PATTERN = Pattern.compile("[\\u4e00-\\u9fa5]");

    /** Extracts the {@code charset} parameter from a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Reads the file at {@code path} and returns every e-mail address found in it.
     *
     * <p>The file is decoded as UTF-8 and scanned as a whole, so addresses are found regardless
     * of where they sit in the file. Each address found is also printed to standard output.
     *
     * @param path path of the HTML file to scan
     * @return the addresses in order of appearance, separated by the platform line separator;
     *         an empty string if the path is not a readable regular file or nothing matches.
     *         If reading fails part-way, the stack trace is printed and whatever was read
     *         before the failure is still scanned.
     */
    public static String readHtml(String path) {
        File htmlFile = new File(path);
        if (!htmlFile.exists() || !htmlFile.isFile() || !htmlFile.canRead()) {
            return "";
        }

        StringBuilder html = new StringBuilder();
        try (Reader in = new InputStreamReader(
                new FileInputStream(htmlFile), StandardCharsets.UTF_8)) {
            char[] buff = new char[4096];
            int nch;
            while ((nch = in.read(buff, 0, buff.length)) != -1) {
                html.append(buff, 0, nch);
            }
        } catch (IOException e) {
            // Best effort: report the failure and scan what was read so far.
            e.printStackTrace();
        }

        StringBuilder emails = new StringBuilder();
        Matcher matcher = EMAIL_PATTERN.matcher(html);
        while (matcher.find()) {
            if (emails.length() > 0) {
                emails.append(System.lineSeparator());
            }
            String email = matcher.group();
            emails.append(email);
            System.out.println(email);
        }
        return emails.toString();
    }

    /**
     * Returns the first e-mail address contained in {@code str}.
     *
     * <p>Surrounding markup and punctuation (tags, commas, full stops) are not part of the
     * result. A found address is also printed to standard output.
     *
     * @param str text to search; may be {@code null}
     * @return the first address found, or an empty string if {@code str} is {@code null}
     *         or contains no address
     */
    public static String checkEmail(String str) {
        if (str == null) {
            return "";
        }
        Matcher matcher = EMAIL_PATTERN.matcher(str);
        if (!matcher.find()) {
            return "";
        }
        String email = matcher.group();
        System.out.println(email);
        return email;
    }

    /**
     * Tells whether {@code str} contains at least one Chinese character.
     *
     * @param str text to inspect; may be {@code null}
     * @return {@code true} if any character lies in U+4E00..U+9FA5, {@code false} otherwise
     *         (including for {@code null})
     */
    public static boolean checkChinese(String str) {
        return str != null && CHINESE_PATTERN.matcher(str).find();
    }

    /**
     * Writes {@code cont} to the file at {@code path} as UTF-8, creating the file if needed and
     * replacing any previous content.
     *
     * <p>I/O failures are reported by printing the stack trace; they are not rethrown.
     *
     * @param cont text to write; {@code null} is written as an empty file
     * @param path destination file path
     */
    public static void writerFile(String cont, String path) {
        File emailFile = new File(path);
        // FileOutputStream creates the file when it is missing, so no explicit createNewFile().
        try (Writer out = new OutputStreamWriter(
                new FileOutputStream(emailFile), StandardCharsets.UTF_8)) {
            out.write(cont == null ? "" : cont);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the resource at {@code strUrl} and prints it to standard output, with
     * {@code "</br>"} appended after every line.
     *
     * <p>The body is decoded with the charset announced in the response's Content-Type header,
     * falling back to UTF-8. A malformed URL or an I/O failure is reported by printing the stack
     * trace; whatever was read before the failure is still printed.
     *
     * @param strUrl URL to fetch
     */
    public static void readUrlCont(String strUrl) {
        StringBuilder cont = new StringBuilder();
        try {
            URL url = new URL(strUrl);
            URLConnection conn = url.openConnection();
            try (InputStream body = conn.getInputStream();
                    BufferedReader reader = new BufferedReader(
                            new InputStreamReader(body, charsetOf(conn)))) {
                String lineCont;
                while ((lineCont = reader.readLine()) != null) {
                    cont.append(lineCont).append("</br>");
                }
            }
        } catch (IOException e) {
            // MalformedURLException is an IOException, so this covers both failure modes.
            e.printStackTrace();
        }
        System.out.println(cont.toString());
    }

    /** Returns the charset named in the connection's Content-Type header, or UTF-8 if none. */
    private static Charset charsetOf(URLConnection conn) {
        String contentType = conn.getContentType();
        if (contentType != null) {
            Matcher matcher = CHARSET_PARAM.matcher(contentType);
            if (matcher.find()) {
                try {
                    return Charset.forName(matcher.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Unknown or malformed charset name: fall back to UTF-8 below.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /**
     * Demo entry point: fetches a web page and prints it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Example of the file-based workflow:
        //   String cont = readHtml("e://test.htm");   // read the file
        //   writerFile(cont, "e://test.txt");         // write the addresses
        //   checkChinese("qwe123");
        readUrlCont("http://www.163.com");
    }
}