# NOTE(review): the statements down to the first `to_csv` call use
# `file_path`, `names` and `roles`, none of which is assigned in the visible
# part of the file. They look like the tail of a definition that starts
# earlier; if so, re-indent them under that header.
texts = []
with open(file_path, "r", encoding="utf8") as f:
    # One JSON object per line; emit one (file name, role, text) row for every
    # entry of the record's "datas" mapping.
    for data_line in f.readlines():
        json_data = json.loads(data_line)
        file_name = json_data["file_name"]
        file_data = json_data["datas"]
        for k, v in file_data.items():
            names.append(file_name)
            roles.append(k)
            texts.append(v)
file_out = "../datas/format/all_text.csv"
dataframe = pd.DataFrame({"names": names, "roles": roles, "texts": texts})
dataframe.to_csv(file_out, index=False, sep="\t")


def search_text(key):
    """Save the rows of all_text.csv whose text matches *key* to a new CSV.

    Reads ``../datas/format/all_text.csv`` (tab-separated), keeps the rows
    whose ``texts`` column contains *key*, and writes them tab-separated to
    ``../datas/classes/<key>.csv``.

    Args:
        key: Pattern searched for in the ``texts`` column. pandas treats it
            as a regular expression by default. It is also used verbatim in
            the output file name.
    """
    file_out = "../datas/classes/" + key + ".csv"
    file_path = "../datas/format/all_text.csv"
    data = pd.read_csv(file_path, sep="\t")
    # na=False: rows with missing text count as "no match" instead of putting
    # NaN into the mask, which boolean indexing refuses.
    matches = data[data["texts"].str.contains(key, na=False)]
    matches.to_csv(file_out, index=False, sep="\t")


def data_annotate():
    """Copy the records of primary.json that contain the marker to label.json.

    Every line of ``../datas/format/primary.json`` is parsed as JSON. A record
    is kept when at least one value of its ``"datas"`` mapping contains the
    marker string; it is then written to ``../datas/annotate/label.json`` as
    ``{"name": ..., "label": "", "datas": ...}``, one JSON object per line.

    Returns:
        The string ``"success"``.
    """
    file_in = "../datas/format/primary.json"
    file_out = "../datas/annotate/label.json"
    # NOTE(review): the comment that accompanied this function says it
    # extracts records containing the character 婚, but the literal in the
    # code is a single space. Confirm which one is intended.
    marker = " "
    # The output is opened first, so it is truncated even if the input is
    # missing.
    with open(file_out, "w", encoding="utf8") as fo, \
            open(file_in, "r", encoding="utf8") as f:
        for line in f:
            json_data = json.loads(line)
            if not any(marker in v for v in json_data["datas"].values()):
                continue
            item = {
                "name": json_data["file_name"],
                "label": "",
                "datas": json_data["datas"],
            }
            fo.write(json.dumps(item, ensure_ascii=False) + "\n")
    return "success"


def annotate():
    """Split label.json into labelled and unlabelled records.

    Every line of ``../datas/annotate/label.json`` is parsed as JSON. Records
    whose ``"label"`` value is truthy are written to ``labeled.json``, all
    others to ``unlabel.json`` (both under ``../datas/annotate/``).

    Returns:
        The string ``"success"``.
    """
    file_in = "../datas/annotate/label.json"
    file_labeled = "../datas/annotate/labeled.json"
    file_unlabeled = "../datas/annotate/unlabel.json"
    with open(file_in, "r", encoding="utf8") as f_in, \
            open(file_labeled, "w", encoding="utf8") as f_labeled, \
            open(file_unlabeled, "w", encoding="utf8") as f_unlabeled:
        for line in f_in:
            json_data = json.loads(line)
            target = f_labeled if json_data["label"] else f_unlabeled
            target.write(json.dumps(json_data, ensure_ascii=False) + "\n")
    return "success"


def label_to_csv():
    """Convert labeled.json into the tab-separated file labeled.csv.

    The output has one row per input line and three columns:

    * ``labels``    - the record's ``"label"`` value,
    * ``datas``     - all values of the record's ``"datas"`` mapping joined
      with ``"|"``,
    * ``data_dict`` - the raw JSON line with its newline removed.
    """
    file_path = "../datas/annotate/labeled.json"
    labels = []
    datas = []
    data_dict = []
    with open(file_path, "r", encoding="utf8") as f:
        for data_line in f:
            json_data = json.loads(data_line)
            labels.append(json_data["label"])
            datas.append("|".join(json_data["datas"].values()))
            data_dict.append(data_line.replace("\n", ""))
    file_out = "../datas/annotate/labeled.csv"
    dataframe = pd.DataFrame(
        {"labels": labels, "datas": datas, "data_dict": data_dict}
    )
    dataframe.to_csv(file_out, index=False, sep="\t")


def get_work():
    """Extract the rows whose text contains 工作 via ``search_text``."""
    search_text("工作")


if __name__ == "__main__":
    label_to_csv()