数据处理| 自己的数据集json文件转txt文件（YOLO所需文件）_json转txt

实现代码

import json
import os
from pathlib import Path # 递归方式
def json2txt(path_json, path_txt):
    """Convert one annotation JSON file into a plain-text label file.

    The JSON document is iterated as a sequence of records. Each record must
    provide the keys ``"Label"``, ``"Point(x,y)"``, ``"W"`` and ``"H"``; one
    line holding those four values, separated by single spaces and in that
    order, is written per record.

    Args:
        path_json: Path (``str`` or ``os.PathLike``) of the JSON file. It is
            decoded as gb18030.
        path_txt: Path (``str`` or ``os.PathLike``) of the text file to create
            or overwrite.

    Raises:
        KeyError: If a record lacks one of the four required keys.
    """
    # A dedicated name for the handle keeps the ``path_json`` argument intact.
    with open(path_json, 'r', encoding='gb18030') as fjson:
        records = json.load(fjson)
    # The JSON is parsed before the output is opened, so a broken input file
    # never leaves a truncated label file behind.
    with open(path_txt, 'w+') as ftxt:
        for shape in records:
            fields = (shape["Label"], shape["Point(x,y)"], shape["W"], shape["H"])
            ftxt.write(' '.join(str(field) for field in fields) + "\n")
# Root folder that is searched recursively for *.json annotation files.
jpath = Path('/workspace/yolo/data/dataset/f001json/')
# Folder that receives one .txt file per JSON file found.
dir_txt = '/workspace/yolo/data/dataset/f5/txtfile/'
# exist_ok avoids the check-then-create race of a separate os.path.exists().
os.makedirs(dir_txt, exist_ok=True)

# Path.rglob() returns a generator over every path below ``jpath`` whose name
# matches the wildcard pattern, so nested sub-folders are covered as well.
for p in jpath.rglob("*.json"):
    # Only the final suffix is swapped. All outputs land directly in
    # ``dir_txt``, so JSON files that share a name in different sub-folders
    # overwrite one another.
    path_txt = os.path.join(dir_txt, p.stem + '.txt')
    json2txt(p, path_txt)

数据预处理扩展

1.使用pandas库读取文件单一json文件代码（可选择）

#将JSON文件读取为Pandas类型
import pandas as pd
# Smoke test on a single JSON file.
# The raw string keeps the backslashes of the Windows path literal; the plain
# literal relied on "\d" being an unrecognised escape sequence.
df = pd.read_json(r'E:\datapro\d1.json')

2.测试单个json文件提取所需要的信息，并写入txt文件（可选择）

import json
person_dict = {}
# NOTE(review): this seed text looks like a directory path, yet it is only used
# as the beginning of the text written to json_list.txt below -- confirm
# whether an empty string was intended. The trailing backslash is escaped
# because an unescaped one swallowed the closing quote.
final_dict = "E:\\Acryl\\datapro\\labels\\"

# Load the JSON file; its content is iterated below as a sequence of records.
with open(r'E:\Acryl\datapro\d1.json') as f:
    person_dict = json.load(f)

# Four "name: value" lines per record. f-strings are used so that values that
# are not strings can be emitted too.
parts = [final_dict]
for data in person_dict:
    parts.append(f"Label: {data['Label']}\n")
    parts.append(f"Point(x,y):{data['Point(x,y)']}\n")
    parts.append(f"W: {data['W']}\n")
    parts.append(f"H: {data['H']}\n")
final_dict = "".join(parts)

# Write the collected text; the context manager closes the file on any exit.
with open("json_list.txt", "w") as text_file:
    n = text_file.write(final_dict)

# Load the JSON file; its content is iterated below as a sequence of records.
with open(r'E:\Acryl\datapro\d1.json') as f:
    person_dict = json.load(f)

# One line per record holding Label, Point(x,y), W and H separated by spaces.
# NOTE(review): the collapsed original reassigned final_dict on every
# iteration, so only the last record reached the file; keeping every record is
# presumably the intent -- confirm.
record_lines = []
for data in person_dict:
    record_lines.append(
        ' '.join(str(data[key]) for key in ('Label', 'Point(x,y)', 'W', 'H'))
    )
final_dict = ''.join(line + '\n' for line in record_lines)

with open("json_list.txt", "w") as text_file:  # write output as a txt
    n = text_file.write(final_dict)

# NOTE(review): fileList was never defined in the original snippet; the
# directory listing of the folder read below is presumably what was meant.
fileList = os.listdir('E:/Acryl/datapro/jsonfile/')
for json_id in fileList:
    print(json_id)
    with open('E:/Acryl/datapro/jsonfile/' + json_id, "r", encoding="utf-8") as f:
        data = f.read()
        # print(data)  # uncomment to dump the whole file while testing


def get_dirs(main_dir):
    """Return the paths of all sub-directories found below ``main_dir``.

    The tree is walked recursively with ``os.walk``, so nested directories are
    included. Per the original author's note, each sub-directory is expected
    to hold several jpg images and one json file.

    Args:
        main_dir: Root directory to walk.

    Returns:
        A list of directory paths, each joined with the root it was found in.
    """
    return [
        os.path.join(root, name)
        for root, dirs, _files in os.walk(main_dir)
        for name in dirs
    ]


def get_file(dir_path):
    """Collect the jpg files and the json file found below ``dir_path``.

    Args:
        dir_path: Directory to walk recursively.

    Returns:
        A tuple ``(list_jpgs, json_path)``. ``list_jpgs`` holds the full paths
        of all ``.jpg`` files. ``json_path`` is the full path of the last
        ``.json`` file encountered (a single one is expected per folder), or
        ``None`` when there is none; the original raised UnboundLocalError in
        that case.
    """
    list_jpgs = []
    json_path = None
    for root, _dirs, files in os.walk(dir_path):
        for name in files:
            if name.endswith(".jpg"):
                list_jpgs.append(os.path.join(root, name))
            if name.endswith(".json"):
                json_path = os.path.join(root, name)
    return list_jpgs, json_path


def get_coordinate(json_path):
    """Read the point lists of every shape stored in a json file.

    The file is loaded with ``json.load``; JSON objects become dicts and JSON
    arrays become lists, so the nested structure is simply walked level by
    level.

    Args:
        json_path: Path of the json file. Its top level must be an object
            with a ``'shapes'`` list whose items each carry a ``'points'``
            entry.

    Returns:
        A list with the ``'points'`` value of every shape, in file order.
    """
    with open(json_path, 'rb') as file_json:
        datas = json.load(file_json)
    return [shape['points'] for shape in datas['shapes']]


# Crop sub-images out of the jpg files of every folder, using the coordinates
# stored in that folder's json file.
if __name__ == '__main__':
    # NOTE(review): cv2 (OpenCV) is not imported anywhere in the visible
    # source and np (numpy) is only imported further down the file; both must
    # be in scope before this section can run.
    main_dir = r"F:\tubiao"
    dirs = get_dirs(main_dir)
    # i is the running folder index; it prefixes the saved crop names.
    for i, dir_path in enumerate(dirs):
        print(dir_path)
        list_jpgs, json_path = get_file(dir_path)
        if json_path is None:
            # Nothing to crop without coordinates.
            continue
        coordinates = get_coordinate(json_path)
        j = 0  # crop counter, restarted for every folder
        for list_jpg in list_jpgs:
            # Per the original author's notes: cv2.imread cannot read paths
            # that contain Chinese characters, hence imdecode; and adding a
            # cvtColor conversion changed the colours of the saved crops.
            # The image is decoded once per file instead of once per crop.
            image = cv2.imdecode(np.fromfile(list_jpg, dtype=np.uint8), cv2.IMREAD_COLOR)
            # One crop per coordinate entry.
            for coordinate in coordinates:
                x1 = int(coordinate[0][0])  # top-left x
                y1 = int(coordinate[0][1])  # top-left y
                x2 = int(coordinate[1][0])  # bottom-right x
                y2 = int(coordinate[1][1])  # bottom-right y
                # Rows (y) are indexed first, columns (x) second.
                cropImg = image[y1:y2, x1:x2]
                save_name = str(i) + "_cut" + str(j) + ".jpg"
                save_path = os.path.join(dir_path, save_name)
                j = j + 1
                # Per the original author's note, cv2.imwrite cannot save to
                # paths containing Chinese characters; [1] selects the encoded
                # buffer returned by imencode, which is written with tofile().
                cv2.imencode('.jpg', cropImg)[1].tofile(save_path)

7.python遍历目录下的所有目录和文件, python读取json文件信息, 生成一个list.txt（可选择）

import json
import os


def readjson(path='E:/Acryl/datapro/jsonfileall/jsonfile/',
             label_path='E:/Acryl/datapro/jsonfileall/label_txt.txt'):
    """Merge the records of every JSON file in a folder into one text file.

    Each JSON file is iterated as a sequence of records providing the keys
    ``'Label'``, ``'Point(x,y)'``, ``'W'`` and ``'H'``. One line holding those
    four values separated by single spaces is written per record and echoed
    to stdout.

    Args:
        path: Folder that contains the JSON files. Entries whose name does not
            end in ``.json`` are skipped; files are processed in sorted name
            order so the output is reproducible.
        label_path: Text file to create or overwrite with the merged lines.

    Raises:
        KeyError: If a record lacks one of the four required keys.
    """
    keys = ('Label', 'Point(x,y)', 'W', 'H')
    with open(label_path, mode='w') as label_txt:
        for name in sorted(os.listdir(path)):
            if not name.endswith('.json'):
                continue
            # os.path.join avoids mixing '/' and '\\' in the file path, and the
            # context manager closes every input file.
            with open(os.path.join(path, name), mode='r', encoding='utf-8') as f:
                records = json.load(f)
            for record in records:
                # str() lets numeric values be joined as well.
                json_str = ' '.join(str(record[key]) for key in keys)
                label_txt.write(json_str + '\n')
                print(json_str)  # echo the extracted data


if __name__ == '__main__':
    readjson()

测试数据为三个json文件，json文件同上

# NOTE(review): this is an excerpt; ``filename``, ``t`` and ``main`` are
# defined outside the visible source, and the statements presumably belong
# inside a per-annotation loop -- confirm the enclosing indentation.
if os.path.exists(filename):
    # Centre point (x, y) and size (w, h) required by the YOLO format, each
    # divided by the image width or height. The arithmetic implies that
    # t['bbox'] is ordered (x_min, y_min, x_max, y_max) -- TODO confirm.
    x = (t['bbox'][0] + t['bbox'][2]) / 2 / t['image_width']
    y = (t['bbox'][1] + t['bbox'][3]) / 2 / t['image_height']
    w = (t['bbox'][2] - t['bbox'][0]) / t['image_width']
    h = (t['bbox'][3] - t['bbox'][1]) / t['image_height']
    file_str = ' '.join(
        [str(t['category'])] + [str(round(value, 6)) for value in (x, y, w, h)]
    )
    with open(filename, mode="r+", encoding="utf-8") as fp:
        # readlines() leaves the position at the end of the file, so the write
        # below appends; a newline is only needed when content already exists.
        line_data = fp.readlines()
        if len(line_data) != 0:
            fp.write('\n' + file_str)
        else:
            fp.write(file_str)
# Create the file when it does not exist yet.
# NOTE(review): nothing is written for ``t`` in this branch -- confirm that
# the label files are meant to be created empty.
else:
    with open(filename, mode="w", encoding="utf-8"):
        pass

if __name__ == '__main__':
    main()

9.自己数据集的批量读取json文件，读取必要信息转为对应的txt文件（YOLO需要的格式）

import os
import numpy as np
import json
def json2txt(path_json, path_txt):
    """Write the records of one JSON annotation file as lines of text.

    The JSON document (decoded as gb18030) is iterated as a sequence of
    records; for each one the values stored under ``"Label"``,
    ``"Point(x,y)"``, ``"W"`` and ``"H"`` are written, in that order and
    separated by single spaces, as one line of ``path_txt``.

    Args:
        path_json: Path of the JSON file to read.
        path_txt: Path of the text file to create or overwrite.
    """
    sep = ' '
    with open(path_json, 'r', encoding='gb18030') as src:
        records = json.load(src)
        with open(path_txt, 'w+') as dst:
            for rec in records:
                # Look-up order is kept as Point, Label, W, H.
                point = rec["Point(x,y)"]
                label = rec["Label"]
                width = rec["W"]
                height = rec["H"]
                dst.write(sep.join(map(str, (label, point, width, height))) + "\n")
# Folder holding the JSON annotation files and folder receiving the labels.
dir_json = 'E:/Acryl/datapro/jsonfileall/jsonfile/'
dir_txt = 'E:/Acryl/datapro/labels/'
# exist_ok avoids the check-then-create race of a separate os.path.exists().
os.makedirs(dir_txt, exist_ok=True)
# Only *.json entries are converted; any other file or sub-folder in the
# listing would make json2txt fail.
list_json = [name for name in os.listdir(dir_json) if name.endswith('.json')]
for cnt, json_name in enumerate(list_json):
    print('cnt=%d,name=%s' % (cnt, json_name))
    path_json = os.path.join(dir_json, json_name)
    # splitext swaps only the final suffix, whereas str.replace would also
    # rewrite a '.json' occurring earlier in the name.
    path_txt = os.path.join(dir_txt, os.path.splitext(json_name)[0] + '.txt')
    print(path_json, path_txt)
    json2txt(path_json, path_txt)

def file_name(file_dir, list_path='labellist.txt'):
    """Write the path of every file below ``file_dir`` to a list file.

    The tree is walked with ``os.walk``; each file contributes one line made
    of its directory joined with its name. Within a directory the names are
    written in sorted order.

    Args:
        file_dir: Root directory whose files are listed.
        list_path: Text file to create or overwrite with the listing.
    """
    # The list file is opened once. Reopening it with 'w+' for every directory
    # kept only the entries of the last directory visited.
    with open(list_path, 'w+') as listing:
        for root, _dirs, files in os.walk(file_dir):
            for f in sorted(files):
                # ``root`` replaces the undefined name ``path1``.
                listing.write(os.path.join(root, f) + '\n')


if __name__ == '__main__':
    file_name('/workspace/yolo/data/dataset/f001json')

labellist.txt

12.批量修改处理txt文件内容

批量修改txt文件，在做YOLO项目时，会需要将文本文件中的某部分内容进行批量替换和修改，所以编写了python程序批量替换所有文本文件中特定部分的内容。

import re
import os
# 38 labels  
def reset(path='/workspace/yolo/data/dataset/test/', old_str="0", new_str="1"):
    """Apply a text replacement to every file directly inside ``path``.

    Sub-directories are skipped; every other entry is handed to ``alter``.

    Args:
        path: Folder whose files are rewritten in place.
        old_str: Text to search for.
        new_str: Text that replaces ``old_str``.

    Note:
        ``alter`` replaces every occurrence within each line, so the default
        "0" to "1" substitution also changes zeros inside other numbers on the
        line.
    """
    # The uninitialised counter ``i = i + 1`` was dropped: it raised
    # UnboundLocalError on the first file and its value was never used.
    for name in os.listdir(path):  # every entry, folders included
        file_path = os.path.join(path, name)
        if os.path.isdir(file_path):
            continue
        alter(file_path, old_str, new_str)
def alter(file, old_str, new_str):
    """Replace every occurrence of ``old_str`` in a UTF-8 text file.

    The rewritten content is first written to ``<file>.bak`` and that file is
    then moved over the original.

    Args:
        file: Path of the file to rewrite in place.
        old_str: Text to search for.
        new_str: Text that replaces ``old_str``.
    """
    backup = "%s.bak" % file
    with open(file, "r", encoding="utf-8") as f1, \
            open(backup, "w", encoding="utf-8") as f2:
        for line in f1:
            # str.replace returns the line unchanged when there is no match,
            # so no separate membership test is needed.
            f2.write(line.replace(old_str, new_str))
    # os.replace overwrites the destination in one step. Removing the original
    # first left a window in which the file did not exist at all.
    os.replace(backup, file)
# Run the batch replacement with the settings hard-coded in reset().
reset()

# Created on 2023.02
# @author: Elena
# coding=utf-8
import os

# Root folder whose text files are rewritten.
path = '/workspace/yolo/data/dataset/test'


def listfiles(dirpath):
    """Return the paths of all files found below ``dirpath``.

    Args:
        dirpath: Root directory, walked recursively with ``os.walk``.

    Returns:
        A list of file paths, each joined with the directory it was found in.
    """
    return [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(dirpath)
        for name in files
    ]


def txt_modify(files):
    """Replace the label 'Truck' with 'Bus' in each of the given files.

    Args:
        files: File names; each is joined with the module-level ``path``
            before being opened, so an absolute name is used unchanged.
    """
    for file in files:
        label_path = os.path.join(path, file)
        with open(label_path, 'r+') as f:
            lines = f.readlines()
            # Rewind and truncate once, before writing. Doing this inside the
            # line loop kept only the last line of every file.
            f.seek(0)
            f.truncate()
            for line in lines:
                # 'Bus' is the new label, 'Truck' the old one. Chain further
                # .replace() calls to substitute several labels at once, e.g.
                # line.replace('dog', 'cat').replace('man', 'boy').
                f.write(line.replace('Truck', 'Bus'))


def main():
    """Change class id 2 to 3 in every file below the module-level ``path``."""
    for fileobj in listfiles(path):
        with open(fileobj, 'r+') as f:
            lines = f.readlines()
            f.seek(0)
            f.truncate()
            for line in lines:
                # Only a leading "2 " is rewritten. Replacing '2 ' anywhere in
                # the line also altered numbers ending in 2 later on the line.
                if line.startswith('2 '):
                    line = '3 ' + line[2:]
                f.write(line)


# Comparing __name__ with the result of calling main() ran main() during the
# comparison and could never match; the usual guard is used instead.
if __name__ == '__main__':
    main()

方法3：处理yolo已经转换好的标签数修改

import os
import re
#path = '/workspace/yolo/data/dataset/testjson2txt/1/'
path = '/workspace/yolo/data/dataset/labels0208/'
# Every .txt file directly inside ``path``.
files = [path + file for file in os.listdir(path) if file.endswith(".txt")]
# Matches a 12 at the start of a line only when it is the whole first column.
# The lookahead keeps ids that merely begin with 12 (120, 123, ...) untouched,
# which the bare '^12' pattern rewrote as well.
pattern = re.compile(r'^12(?=\s|$)', flags=re.MULTILINE)
for file in files:
    with open(file, 'r') as f:
        new_data = pattern.sub('1', f.read())    # replace the 12 in the first column with 1
    print("Down")
    with open(file, 'w') as f:
        f.write(new_data)

第四篇—json标签(labelme)转txt标签（YOLOv5专题） - 哔哩哔哩 (bilibili.com)

YOLOV5——将 json 格式的标注数据转化为 YOLO 需要的 txt 格式 - yx啦啦啦 - 博客园 (cnblogs.com)

1、YOLO 动物目标检测数据集，真实场景的高质量图片数据，数据场景丰富，图片格式为 jpg，共 16100 张图片，分为训练集和验证集。 2、使用 labelImg 标注软件标注，标注框质量高，标签格式为 json 格式，可以直接用于 YOLO 系列的目标检测。 3、数据集详情展示和更多数据集下载：https://blog.csdn.net/m0_64879847/article/details/132301975 1）文件夹目录下有 json 格式数据和 jpg 数据，读取 json 数据时会报错。解决方法：增加 json 文件后缀判断 for json_name in json_names: if json_name[-4:]=='json': 2）pandas 数据用 append 增加行无效。解决方法：需要重新对 pd 进行赋值更新 label=label.append(new_label,ignore_index=True) 3）json 数据坐标存在缺失，本应有四个值

在参加许多目标检测比赛时，为了能够获得合理的评价结果，官方往往是将已经打好标签的 数据集 事先划分好训练集与测试集，将训练集和测试集的标签分别存放在 json 文件。以百度飞桨平台第17届全国大学生智能汽车竞赛百度创意组 数据集 为例，我们将学习如何将 json 格式的 数据集 标签转化为有效的 txt 文件。分析 json 格式标签了解 json 文件格式，详细请参考这篇博客： Json

好的，您想将大量的 JSON 文件转换成 TXT 格式，以便于使用 YOLO 目标检测算法进行训练和测试。我可以提供以下几种方法： 1. 使用Python编写脚本，将 JSON 文件中的数据解析出来，并转换为 YOLO 所需的 TXT 格式。您可以使用Python的 json 库读取 JSON 文件，然后将其转换为 YOLO 格式的 TXT 文件。 2. 使用在线转换工具，将 JSON 文件上传到该工具中，然后选择将其转换为 YOLO 格式的 TXT 文件。您可以搜索“ JSON to YOLO txt converter”来寻找在线转换工具。 3. 使用专业的 数据处理 软件，如LabelImg或LabelBox，将 JSON 文件打开，然后将其转换为 YOLO 格式的 TXT 文件。这些软件都提供了将 JSON 文件转换为 TXT 文件的功能。请注意，转换 JSON 文件到 TXT 格式时，需要保证数据格式的正确性和完整性，以避免在训练和测试时出现错误和偏差。