正则表达式语法
((?!{“ckResult”).)* :表示非字符串{"ckResult"开头的字符
spark.sql("""with tmp1 as (
select '{"checkNo":"123345666","checkTime":"2031.12.28","companyName":"上海公司","finalSummary":"阳性结果和异常情况\n【1】 您本次体检所检项目未见异常,建议您定期健康体检。\n","gender":"","kindList":[{"checkItemList":[{"ckResult":"157.5","itemName":"身高","unit":"cm"},{"ckResult":"53.0","itemName":"体重","unit":"Kg"},{"ckResult":"21.4","itemName":"体重指数","normalVal":"18.5--23.99"},{"ckResult":"105","itemName":"收缩压","normalVal":"90--139","unit":"mmHg"},{"ckResult":"71","itemName":"舒张压","normalVal":"60--89","unit":"mmHg"}],"doctorName":"刘洋","kindName":"一般项目检查","summary":"未见明显异常"},{"checkItemList":[{"ckResult":"无","itemName":"病史"},{"ckResult":"无特殊","itemName":"家族史"},{"ckResult":"68","itemName":"心率(次/分)","unit":"次/分"},{"ckResult":"齐","itemName":"心律"},{"ckResult":"正常","itemName":"心音"},{"ckResult":"双侧呼吸音未闻及异常","itemName":"肺部听诊"},{"ckResult":"肝脏肋下未触及","itemName":"肝脏触诊"},{"ckResult":"脾脏肋下未触及","itemName":"脾脏触诊"},{"ckResult":"双肾区无叩痛","itemName":"肾脏叩诊"},{"ckResult":"无","itemName":"内科其它"}],"doctorName":"周芷萍","kindName":"内科","summary":"未见明显异常女外科\n"},{"checkItemList":[{"ckResult":"5.6","itemName":"白细胞计数","normalVal":"3.5--9.5","shortCode":"WBC","unit":"10^9/L"},{"ckResult":"4.35","itemName":"红细胞计数","normalVal":"3.8--5.1","shortCode":"RBC","unit":"10^12/L"},{"ckResult":"136.0","itemName":"血红蛋白","normalVal":"115--150","shortCode":"Hb","unit":"g/L"},{"ckResult":"0.41","itemName":"红细胞压积","normalVal":"0.35--0.45","shortCode":"HCT","unit":"L/L"},{"ckResult":"93.6","itemName":"平均红细胞体积","normalVal":"82--100","shortCode":"MCV","unit":"fL"},{"ckResult":"31.3","itemName":"平均红细胞血红蛋白含量","normalVal":"27--34","shortCode":"MCH","unit":"pg"},{"ckResult":"334.0","itemName":"平均红细胞血红蛋白浓度","normalVal":"316--354","shortCode":"MCHC","unit":"g/L"},{"ckResult":"12.0","itemName":"RDW-CV","normalVal":"10.9--15.4"},{"ckResult":"287.0","itemName":"血小板计数","normalVal":"125--350","shortCode":"PLT","unit":"10^9/L"},{"ckResult":"9.7","itemName":"平均血小板体积","normalVal":"7.6--13.6","shortCode":"MPV","unit":"fL"},{"ckResult":"11.6","itemName":"血小板分布宽度","normalVal":"9.0--17","shortCode":"PDW"},{"ckResult":"29.1","itemName":"淋巴细胞百分比","normalVal":"20--50%","shortCode":"LYMPH%"},{"ckResult":"5.3","itemName":"中间细胞百分比","normalVal":"3--10%","shortCode":"MON%"},{"ckResult":"65.6","itemName":"中性粒细胞百分比","normalVal":"40--75%","shortCode":"NEUT%"},{"ckResult":"1.6","itemName":"淋巴细胞绝对值","normalVal":"1.1--3.2","shortCode":"LYMPH","unit":"10^9/L"},{"ckResult":"0.3","itemName":"中间细胞绝对值","normalVal":"0.12--1.0","unit":"10^9/L"},{"ckResult":"3.7","itemName":"中性粒细胞绝对值","normalVal":"1.8--6.3","shortCode":"NEUT","unit":"10^9/L"},{"ckResult":"41.0","itemName":"红细胞分布宽度-标准差","normalVal":"39--53.9","shortCode":"RDW-SD","unit":"g/L"},{"ckResult":"0.235","itemName":"血小板压积","normalVal":"0.108--0.282%","shortCode":"PCT"}],"doctorName":"韩秀杰","kindName":"血常规","summary":"未见明显异常"},{"checkItemList":[{"ckResult":"1.020","itemName":"尿比重","normalVal":"1.010--1.025","shortCode":"SG"},{"ckResult":"6.0","itemName":"尿酸碱度","normalVal":"5.4--8.4","shortCode":"PH"},{"ckResult":"阴性","itemName":"尿白细胞","normalVal":"阴性","shortCode":"LEU","unit":"Cell/uL"},{"ckResult":"阴性","itemName":"尿亚硝酸盐","normalVal":"阴性","shortCode":"NIT"},{"ckResult":"阴性","itemName":"尿蛋白质","normalVal":"阴性","shortCode":"PRO","unit":"g/L"},{"ckResult":"阴性","itemName":"尿糖","normalVal":"阴性","shortCode":"GLU","unit":"mmol/L"},{"ckResult":"阴性","itemName":"尿酮体","normalVal":"阴性","shortCode":"KET","unit":"mmol/L"},{"ckResult":"阴性","itemName":"尿胆原","normalVal":"阴性","shortCode":"URO","unit":"umol/L"},{"ckResult":"阴性","itemName":"尿胆红素","normalVal":"阴性","shortCode":"BIL","unit":"umol/L"},{"ckResult":"阴性","itemName":"尿隐血","normalVal":"阴性","shortCode":"BLD","unit":"Cell/uL"}],"doctorName":"韩秀杰","kindName":"尿常规","summary":"未见明显异常"},{"checkItemList":[{"ckResult":"9","itemName":"丙氨酸氨基转移酶","normalVal":"0.00--40","shortCode":"ALT","unit":"U/L"},{"ckResult":"16","itemName":"天门冬氨酸氨基转移酶","normalVal":"0--40","shortCode":"AST","unit":"U/L"},{"ckResult":"4.28","itemName":"尿素","normalVal":"1.7--8.3","shortCode":"UREA","unit":"mmol/L"},{"ckResult":"4.72","itemName":"空腹血葡萄糖","normalVal":"3.90--6.10","shortCode":"FBG","unit":"mmol/L"},{"ckResult":"5.13","itemName":"总胆固醇","normalVal":"3.10--5.69","shortCode":"TC","unit":"mmol/L"},{"ckResult":"0.59","itemName":"甘油三酯","normalVal":"0.4--1.53","shortCode":"TG","unit":"mmol/L"},{"ckResult":"7.69","itemName":"甲胎蛋白定量","normalVal":"0--20","shortCode":"AFP","unit":"ng/ml"},{"ckResult":"1.10","itemName":"癌胚抗原定量","normalVal":"0--5","shortCode":"CEA","unit":"ng/ml"}],"doctorName":"韩秀杰、赵洁","kindName":"实验室检查","summary":"未见明显异常"},{"checkItemList":[{"ckResult":"未见明显异常","itemName":"肝"},{"ckResult":"未见明显异常","itemName":"胆"},{"ckResult":"未见明显异常","itemName":"胰"},{"ckResult":"未见明显异常","itemName":"脾"},{"ckResult":"未见明显异常","itemName":"双肾"}],"doctorName":"蒋海燕","kindName":"超声检查室","summary":"未见明显异常心电图\n"}],"medicalBranchOrg":"爱康国宾","medicalOrgName":"爱康国宾","name":"徐晓莉"}
text_info
select regexp_extract(b.text_info,'"ckResult":"([^"]*)"([^}]*)"itemName":"(中性粒细胞数|中性粒细胞值|中性粒细胞绝对值|中性粒细胞计数|嗜中性粒细胞绝对值)([^"]*)"([^}]*)}',1) na --中性粒细胞绝对值
,regexp_extract(b.text_info,'"ckResult":"([^"]*)"([^}]*)"itemName":"红细胞计数"([^}]*)}',1) rbc --红细胞计数
,regexp_extract(b.text_info,'"ckResult":"([^"]*)"([^}]*)"itemName":"血红蛋白"([^}]*)}',1) hb --血红蛋白
,regexp_extract(b.text_info,'"ckResult":"([^"]*)"([^}]*)"itemName":"血小板计数"([^}]*)}',1) plat --血小板
,regexp_extract(b.text_info,'"ckResult":"([^"]*)"([^}]*)"itemName":"尿隐血"([^}]*)}',1) ubc --红细胞镜检(尿隐血)
,regexp_extract(b.text_info,'"ckResult":"([^"]*)"([^}]*)"itemName":"尿蛋白质"([^}]*)}',1) tup --尿蛋白
,regexp_extract(b.text_info,'"ckResult":"([^"]*)"([^}]*)"itemName":"(尿液管型|尿管型)([^"]*)"([^}]*)}',1) bc --尿管型
,regexp_extract(b.text_info,'"kindName":"宫颈刮片([^"]*)"([^}]*)"summary":"([^"]*)"([^}]*)}',3) css --宫颈刮片
,regexp_extract(b.text_info,'"kindName":"内科([^"]*)"([^}]*)"summary":"([^"]*)"([^}]*)}',3) iea --内科检查异常+B39:E48
,regexp_extract(b.text_info,'"kindName":"外科([^"]*)"([^}]*)"summary":"([^"]*)"([^}]*)}',3) sea --外科检查异常
from tmp1 b
""").show()
- lateral view explode
spark.sql("""
with tmp1 as (
select 1 id,78 score_english, 88 score_math, 97 score_chinese
union all
select 2 id,48 score_english, 83 score_math, 67 score_chinese
union all
select 3 id,58 score_english, 89 score_math, 85 score_chinese
select id,adid,ad from tmp1
lateral view explode(map('english',score_english,'math',score_math,'chinese',score_chinese)) adtable as adid,ad
""").show
spark.sql("""
with tmp1 as (
select 1 id
select id,t1 from tmp1
lateral view explode(array(1,2,3)) t as t1
""").show
select
diagnose_code
,diagnose
,accident_ind
,disease_ind
,risk_type
,adid
from icd10_standard_disease_info_mapping
-- 列转行
lateral view explode (map(
'D001',D001,'D002',D002,'D003',D003,'D004',D004,'D005',D005,'D006',D006,'D007',D007,'D008',D008,'D009',D009,'D010',D010
,'D011',D011,'D012',D012,'D013',D013,'D014',D014,'D015',D015,'D016',D016,'D017',D017,'D018',D018,'D019',D019,'D020',D020
,'D021',D021,'D022',D022,'D023',D023,'D024',D024,'D025',D025,'D026',D026,'D027',D027,'D028',D028,'D029',D029,'D030',D030
,'D031',D031,'D032',D032,'D033',D033,'D034',D034,'D035',D035,'D036',D036,'D037',D037,'D038',D038,'D039',D039,'D040',D040
,'D041',D041,'D042',D042,'D043',D043,'D044',D044,'D045',D045,'D046',D046,'D047',D047,'D048',D048,'D049',D049,'D050',D050
,'D051',D051,'D052',D052,'D053',D053,'D054',D054,'D055',D055,'D056',D056,'D057',D057,'D058',D058,'D059',D059,'D060',D060
,'D061',D061,'D062',D062,'D063',D063,'D064',D064,'D065',D065,'D066',D066,'D067',D067,'D068',D068,'D069',D069,'D070',D070
,'D071',D071,'D072',D072,'D073',D073,'D074',D074,'D075',D075,'D076',D076,'D077',D077,'D078',D078,'D079',D079,'D080',D080
,'D081',D081,'D082',D082,'D083',D083,'D084',D084,'D085',D085,'D086',D086,'D087',D087,'D088',D088,'D089',D089,'D090',D090
,'D091',D091,'D092',D092,'D093',D093,'D094',D094,'D095',D095,'D096',D096,'D097',D097,'D098',D098,'D099',D099,'D100',D100
,'M001',M001,'M002',M002,'M003',M003,'M004',M004,'M005',M005,'M006',M006,'M007',M007,'M008',M008,'M009',M009,'M010',M010
,'M011',M011,'M012',M012,'M013',M013,'M014',M014,'M015',M015,'M016',M016,'M017',M017,'M018',M018,'M019',M019,'M020',M020
,'M021',M021,'M022',M022,'M023',M023,'M024',M024)) adTable AS adid,ad
where pt = '${bizdate}000000'
and ad=1
-
查看Parquet 表时提示找不到Parquet文件
验证:1.insert…select 语句中是否有重名列;
2.write parquet 前打印schema与目标表结构比对
3.目标表的路径需要删除重新建立
hadoop dfs -rm -r /hdfs/table
hadoop dfs -mkdir /hdfs/table
-
设置自增序列
row_number() over():什么参数都不加则生成一个从1开始的自增字段
insert into table User_Attribute select row_number() over() as id,customid from tbl_custom;
若下次操作需要在上次自增值的基础上继续自增,实现序列的效果,可以先计算出原有的number: max(id),join on (1=1)关联上去
insert into table User_Attribute
select row_number() over() + id_max as id,customid
from tbl_custom a
join (select max(id) id_max from User_Attribute) b on (1 = 1);
- lag (col ignore nulls ) —> 替换为 last(expr[, isIgnoreNull]) - Returns the last value of expr for a group of rows. If isIgnoreNull is true, returns only non-null values
属于Spark built-in function, hive中不支持,Spark SQL 可以
last 会考虑当前行,使用时结合rows between 限定范围
with tmp1 as (
select 'hjl' name,'a' level,1 id
union all
select 'hjl' name,'' level,2 id
union all
select 'hjl' name,null level,3 id
union all
select 'hjl' name,'b' level,4 id
union all
select 'hjl' name,'c' level,5 id
union all
select 'hjl' name,null level,6 id
union all
select 'hjl' name,'d' level,7 id
union all
select 'hjl' name,'' level,8 id
select a.*
,last(case when level = '' then null else level end,true) over(partition by name order by id rows between unbounded preceding and 1 preceding) last_level
from tmp1 a
spark-submit --queue $1 --num-executors $2 --executor-cores $3 --executor-memory $4 --driver-memory 15G --driver-cores 24 --master yarn --deploy-mode $5 --class $6 $7 $8 $9
num-executors * executor-memory 过大的话,会直接无法提交程序
spark2.4scala2.11准备环境准备数据//造数据需求//需求结果// all_v_list 不需要排序 去重即可自定义函数// 聚合函数输入数据结构// 缓存区数据结构,用于计算// 聚合函数输出值数据结构// 聚合函数是否是幂等的,即相同输入是否总是能得到相同输出// 初始化缓冲区}// 给聚合函数传入一条新数据进行处理}else{}}// 合并聚合函数缓冲区(分布式)}// 计算最终返回结果list}}注册函数//注册函数测试"""|select。
1、intersect运算符intersect运算符通过只包括TABLE1和TABLE2中都有的行并消除所有重复行而派生出一个结果表。当ALL随INTERSECT一起使用时(intersectall),不消除重复行。2、minus运算符minus运算符通过只包括TABLE1和TABLE2中都有的行并消除所有重复行而派生出一个结果表。当all随minus一...