@zhou-si 2016-08-29T01:38:05.000000Z 字数 3909 阅读 1882

java对指定文件中汉字数字字母进行切分

java切词

初衷

最近项目组有个小需求：编辑组那边收集了一些电商产品信息，需要按字符类型（汉字、字母、数字）对其进行切分，并用指定分隔符连接，比如："三星GALAXY A4 移动4G" --> "三星@GALAXYA@4@移动@4@G"

代码实现(工具类)

package utils_split;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Created: 2016-08-28.
 * Character-classification helpers used to decide where a separator goes.
 * Every method inspects a single {@code char} and has no side effects.
 *
 * @author 圣斗士宙斯
 */
public class Utils {
    /** First code unit of the range treated as Chinese (U+4E00). */
    private static final char CJK_FIRST = 0x4E00;
    /** Last code unit of the range treated as Chinese (U+9FA5). */
    private static final char CJK_LAST = 0x9FA5;

    /**
     * Tells whether the character is a digit.
     *
     * @param cha the character to test
     * @return the result of {@link Character#isDigit(char)}, which also accepts
     *         digits outside ASCII
     */
    public static boolean isNumeric(char cha) {
        return Character.isDigit(cha);
    }

    /**
     * Tells whether the character is an ASCII letter.
     *
     * @param cha the character to test
     * @return {@code true} for 'A'..'Z' and 'a'..'z', {@code false} otherwise
     */
    public static boolean word(char cha) {
        return (cha >= 'A' && cha <= 'Z') || (cha >= 'a' && cha <= 'z');
    }

    /**
     * Tells whether the character lies in the range U+4E00..U+9FA5.
     *
     * A plain range comparison replaces the regular expression that used to be
     * compiled on every call; the accepted set of characters is the same.
     *
     * @param str the character to test
     * @return {@code true} when the character is inside the range (inclusive)
     */
    public static boolean chinese(char str) {
        return str >= CJK_FIRST && str <= CJK_LAST;
    }
}

代码实现(实现类)

package split;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import utils_split.Utils;
/**
 * Created: 2016-08-28.
 * Command-line tool: reads the file at the given absolute path line by line and
 * writes a copy named {@code <path>_new} in which a separator is inserted
 * wherever the character class changes.
 * <ul>
 *   <li>Chinese next to digit, letter next to digit, Chinese next to letter:
 *       the separator is inserted between them.</li>
 *   <li>digit next to digit, letter next to letter, Chinese next to Chinese:
 *       nothing is inserted.</li>
 * </ul>
 * The requirement came from the data-mining team's word segmentation work.
 *
 * @author 圣斗士宙斯
 */
public class JavaSplitFile {
    /** Path of the input file; taken from the first command-line argument. */
    static String filePath = "";
    /** Separator to insert; "@" unless a second command-line argument is given. */
    static String splitStr = "@";

    // Character classes used when deciding whether a separator is needed.
    private static final int OTHER = 0;
    private static final int DIGIT = 1;
    private static final int LETTER = 2;
    private static final int CHINESE = 3;

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the input file path (required);
     *             {@code args[1]} is the separator (optional, defaults to "@")
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.out.println("第一个参数不是一个文件的绝对路径！！！");
            return;
        }
        filePath = args[0];
        if (args.length > 1) {
            splitStr = args[1];
        }
        long startTime = System.currentTimeMillis();
        File file = new File(filePath);
        File newFile = new File(filePath + "_new");

        // Validate the input before touching the output file.
        if (!file.exists() || !file.isFile()) {
            System.out.println("第一个参数不是一个文件的绝对路径！！！");
            return;
        }

        // The input is decoded as GBK. The output goes through FileWriter, so it
        // is encoded with whatever charset FileWriter uses by default on this JVM.
        // Opening the writer without the append flag truncates any previous result.
        try (FileInputStream fis = new FileInputStream(file);
                BufferedReader br = new BufferedReader(new InputStreamReader(fis, "gbk"));
                FileWriter fw = new FileWriter(newFile)) {
            String lineText;
            while ((lineText = br.readLine()) != null) {
                // Written line by line instead of accumulating the whole file.
                fw.write(splitLine(lineText, splitStr));
                fw.write("\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }

        long endTime = System.currentTimeMillis();
        System.out.println("输入文件：" + filePath);
        System.out.println("输出文件：" + newFile);
        System.out.println("耗时：" + (endTime - startTime) / 1000.00 + "秒");
    }

    /**
     * Converts one input line to its separated form.
     *
     * Everything from the first full-width "（" onwards and everything from the
     * first "http://" onwards is discarded, then all whitespace is removed. The
     * first remaining character is always kept. Each later character is kept only
     * if it is a digit, an ASCII letter or a Chinese character, and the separator
     * is inserted before it when its class differs from the class of the last
     * kept character. Comparing against the last kept character (rather than the
     * raw previous one) means a skipped symbol no longer causes the following
     * letter or Chinese character to be lost.
     *
     * @param line      raw line as read from the file
     * @param separator text inserted at each class boundary
     * @return the converted line without a line terminator; empty when nothing
     *         is left after filtering
     */
    private static String splitLine(String line, String separator) {
        String text = cutBefore(cutBefore(line, "（"), "http://").replaceAll("\\s+", "");
        if (text.isEmpty()) {
            return "";
        }
        StringBuilder out = new StringBuilder(text.length() * 2);
        char first = text.charAt(0);
        out.append(first);
        int lastKind = classify(first);
        for (int n = 1; n < text.length(); n++) {
            char current = text.charAt(n);
            int kind = classify(current);
            if (kind == OTHER) {
                continue; // symbols and punctuation are not copied to the output
            }
            // lastKind is OTHER only when the line started with a symbol; no
            // separator is inserted after that leading symbol.
            if (lastKind != OTHER && kind != lastKind) {
                out.append(separator);
            }
            out.append(current);
            lastKind = kind;
        }
        return out.toString();
    }

    /** Returns the part of {@code text} before the first {@code marker}, or all of it if absent. */
    private static String cutBefore(String text, String marker) {
        int idx = text.indexOf(marker);
        return idx < 0 ? text : text.substring(0, idx);
    }

    /** Maps a character to DIGIT, LETTER, CHINESE or OTHER using the Utils predicates. */
    private static int classify(char c) {
        if (Utils.isNumeric(c)) {
            return DIGIT;
        }
        if (Utils.word(c)) {
            return LETTER;
        }
        if (Utils.chinese(c)) {
            return CHINESE;
        }
        return OTHER;
    }
}

结果

输入文件：C:/Users/zhousi/Desktop/mobile_param_url.txt
输出文件：C:\Users\zhousi\Desktop\mobile_param_url.txt_new
耗时：0.261秒
（示例：）
三星@F@308
三星@SGH@208
三星@SCH@609

java对指定文件中汉字数字字母进行切分

初衷

最近项目组有个小需求：编辑组那边收集了一些电商产品信息，需要按字符类型（汉字、字母、数字）对其进行切分，并用指定分隔符连接，比如："三星GALAXY A4 移动4G" --> "三星@GALAXYA@4@移动@4@G"

代码实现(工具类)

代码实现(实现类)

结果

内容目录