texts =
[]
with open(file_path,
"
r
"
, encoding=
"
utf8
"
) as f:
for
data_line
in
f.readlines():
json_data
=
json.loads(data_line)
file_name
= json_data[
"
file_name
"
]
file_data
= json_data[
"
datas
"
]
for
k,v
in
file_data.items():
names.append(file_name)
roles.append(k)
texts.append(v)
file_out
=
"
../datas/format/all_text.csv
"
dataframe
= pd.DataFrame({
'
names
'
: names,
'
roles
'
: roles,
"
texts
"
: texts})
dataframe.to_csv(file_out, index
=False, sep=
'
\t
'
)
"""
从csv搜索数据
"""
def search_text(key, file_path="../datas/format/all_text.csv",
                out_dir="../datas/classes/"):
    """Write the rows whose ``texts`` column matches ``key`` to ``<key>.csv``.

    Reads the tab-separated table at ``file_path``, keeps the rows whose
    ``texts`` value contains ``key`` and writes them, tab-separated and
    without the index column, to ``<out_dir>/<key>.csv``.

    Args:
        key: Pattern to look for. It is passed to ``Series.str.contains``
            unchanged, so pandas treats it as a regular expression.
        file_path: Tab-separated input table with a ``texts`` column.
        out_dir: Directory that receives ``<key>.csv``.
    """
    import os  # local import: the file's import block is outside this block

    file_out = os.path.join(out_dir, key + ".csv")
    data = pd.read_csv(file_path, sep="\t")
    # Empty cells come back from read_csv as NaN. na=False counts them as
    # "no match"; without it the mask contains NaN and indexing raises.
    matches = data[data["texts"].str.contains(key, na=False)]
    matches.to_csv(file_out, index=False, sep='\t')
"""
提取带有婚字的数据
"""
def data_annotate(file_in="../datas/format/primary.json",
                  file_out="../datas/annotate/label.json"):
    """Copy the records that mention "婚" into a file ready for labelling.

    ``file_in`` is read as JSON Lines. Every record in which at least one
    value of its ``"datas"`` mapping contains the character "婚" is written
    to ``file_out`` as one JSON object per line with the keys ``"name"``
    (the record's ``"file_name"``), ``"label"`` (an empty string) and
    ``"datas"`` (copied unchanged).

    Args:
        file_in: Path of the JSON Lines input.
        file_out: Path of the JSON Lines output; it is overwritten.

    Returns:
        The string ``"success"``.
    """
    # Open the input first, so a missing input fails before the output file
    # is created or truncated.
    with open(file_in, "r", encoding="utf8") as f, \
            open(file_out, "w", encoding="utf8") as fo:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            json_data = json.loads(line)
            datas = json_data["datas"]
            if not any("婚" in v for v in datas.values()):
                continue
            item = {
                "name": json_data["file_name"],
                "label": "",
                "datas": datas,
            }
            fo.write(json.dumps(item, ensure_ascii=False) + "\n")
    return "success"
"""
提取标注过的数据
"""
def annotate(file_in="../datas/annotate/label.json",
             file_labeled="../datas/annotate/labeled.json",
             file_unlabeled="../datas/annotate/unlabel.json"):
    """Split the labelling file into labelled and unlabelled records.

    ``file_in`` is read as JSON Lines. A record whose ``"label"`` value is
    truthy is re-serialised to ``file_labeled``; every other record goes to
    ``file_unlabeled``. Both outputs hold one JSON object per line.

    Args:
        file_in: Path of the JSON Lines input.
        file_labeled: Output path for records with a non-empty label.
        file_unlabeled: Output path for records with an empty label.

    Returns:
        The string ``"success"``.
    """
    # The input is opened first, so a missing input fails before either
    # output file is created or truncated.
    with open(file_in, "r", encoding="utf8") as f_in, \
            open(file_labeled, "w", encoding="utf8") as f_labeled, \
            open(file_unlabeled, "w", encoding="utf8") as f_unlabeled:
        for line in f_in:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            json_data = json.loads(line)
            target = f_labeled if json_data["label"] else f_unlabeled
            target.write(json.dumps(json_data, ensure_ascii=False) + "\n")
    return "success"
def label_to_csv(file_path="../datas/annotate/labeled.json",
                 file_out="../datas/annotate/labeled.csv"):
    """Convert the labelled JSON Lines file into a tab-separated table.

    Each record in ``file_path`` becomes one row with three columns:
    ``labels`` (the record's ``"label"``), ``datas`` (the values of its
    ``"datas"`` mapping joined with ``"|"``) and ``data_dict`` (the raw
    JSON line without its trailing newline). The table is written to
    ``file_out`` without the index column.

    Args:
        file_path: Path of the JSON Lines input.
        file_out: Path of the tab-separated output; it is overwritten.
    """
    labels = []
    datas = []
    data_dict = []
    with open(file_path, "r", encoding="utf8") as f:
        for data_line in f:
            if not data_line.strip():
                continue  # tolerate blank lines such as a trailing newline
            json_data = json.loads(data_line)
            labels.append(json_data["label"])
            datas.append("|".join(json_data["datas"].values()))
            # Keep the original serialisation so the row can be parsed back.
            data_dict.append(data_line.rstrip("\n"))

    dataframe = pd.DataFrame({
        'labels': labels,
        'datas': datas,
        "data_dict": data_dict,
    })
    dataframe.to_csv(file_out, index=False, sep='\t')
"""
提取带工作的数据
"""
def get_work():
    """Run ``search_text`` with the keyword "工作" (work)."""
    keyword = "工作"
    search_text(keyword)
if __name__ == "__main__":
    # Script entry point: only the labelled-JSON-to-CSV step is run.
    label_to_csv()