0650-6.2.0-通过UDF实现Hive&Impala的中文拼音排序开发者社区

0650-6.2.0-通过UDF实现Hive&Impala的中文拼音排序

select * from hanzi order by s2;

public String evaluate(String ChineseLanguage) {
    char[] cl_chars = ChineseLanguage.trim().toCharArray();
    String hanyupinyin = "";
    HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat();
    defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);// 输出拼音全部小写
    defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);// 不带声调
    defaultFormat.setVCharType(HanyuPinyinVCharType.WITH_V);
    try {
        for (int i = 0; i < cl_chars.length; i++) {
            if (String.valueOf(cl_chars[i]).matches("[\u4e00-\u9fa5]+")) {// 如果字符是中文,则将中文转为汉语拼音
                hanyupinyin += PinyinHelper.toHanyuPinyinStringArray(cl_chars[i], defaultFormat)[0];
            } else {// 如果字符不是中文,则不转换
                hanyupinyin += cl_chars[i];
    } catch (BadHanyuPinyinOutputFormatCombination e) {
        System.out.println("字符不能转成汉语拼音");
    return hanyupinyin;
}

<build>
    <plugins>
        <plugin>
            <artifactId>maven-assembly-plugin</artifactId>
            <configuration>
                <archive>
                    <manifest>
                        <mainClass>cn.com.gzcb.hive.udf.HanyuPinyinHelper</mainClass>
                    </manifest>
                </archive>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
        </plugin>
    </plugins>
</build>
<dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>2.1.1-cdh6.2.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.belerweb/pinyin4j -->