import re
import os
# List a directory's entries, ordered numerically by file name (e.g. 2.txt before 10.txt).
def get_file_list(file_path):
    """Return the entries of a directory sorted by their numeric file stem.

    Every entry is expected to be named ``<integer><extension>`` (for
    example ``12.txt``). The extension is stripped and the remainder is
    compared as an ``int``, so ``2.txt`` sorts before ``10.txt``.

    Args:
        file_path: Path of the directory to list.

    Returns:
        A list of entry names (not full paths) in ascending numeric order.
        An empty directory yields an empty list, so the result can always
        be iterated.

    Raises:
        ValueError: If an entry's stem is not an integer.
        OSError: If the directory cannot be listed.
    """
    def numeric_stem(name):
        # splitext drops whatever extension is present instead of assuming
        # it is exactly four characters long (".txt").
        return int(os.path.splitext(name)[0])

    # To order by timestamp instead, use a key such as
    # os.path.getmtime(os.path.join(file_path, name)) (last modification)
    # or os.path.getctime(...) in place of numeric_stem.
    return sorted(os.listdir(file_path), key=numeric_stem)
def killAnUnseen(s):
    """Return ``s`` with every character that GBK cannot encode removed.

    The string is encoded repeatedly; each ``UnicodeEncodeError`` reports
    the span of the offending character(s), which is dropped before the
    remainder is tried again. The work is done in a loop rather than by
    recursion, so strings containing many unencodable characters do not
    exhaust the recursion limit.

    Args:
        s: The text to clean.

    Returns:
        A string for which ``.encode('gbk')`` succeeds. Encodable
        characters are kept in their original order.
    """
    kept = []
    rest = s
    while True:
        try:
            rest.encode('gbk')
        except UnicodeEncodeError as err:
            # Everything before err.start encoded fine; keep it, skip the
            # unencodable span, and continue with what follows it.
            kept.append(rest[:err.start])
            rest = rest[err.end:]
        else:
            kept.append(rest)
            return "".join(kept)
# Convert every UTF-8 text file in ./data_old into a GBK-encoded copy in
# ./data, dropping any character that GBK cannot represent.
filelist = get_file_list('./data_old') or []  # tolerate a None result for an empty directory
os.makedirs('./data', exist_ok=True)  # the output directory may not exist yet
# To resume a partially completed run, slice the list here, e.g. filelist[60:].
for file in filelist:
    print("\r {} is working".format(file), end="")
    result = []
    # Name the encoding explicitly: open() falls back to the locale's
    # preferred encoding, which is not UTF-8 on every platform.
    with open('./data_old/' + file, 'r', encoding='utf-8') as f:
        for line in f:
            try:  # check whether the line is representable in GBK
                line.encode('gbk')
            except UnicodeEncodeError:  # it is not: show it, then strip the offending characters
                print(line)
                line = killAnUnseen(line)
            result.append(line)
    with open('./data/' + file, 'w', encoding='gbk') as f_new:
        f_new.writelines(result)