import json
import os
from glob import glob
from pathlib import Path
from typing import Callable, Dict, Iterator, List, Union

from datasets import Dataset as HFDataset

# paths and config are defined elsewhere in the notebook
train_json_files = glob(paths.TRAIN_JSON_FOLDER + "*.json")


def get_gt_string_and_xy(filepath: Union[str, os.PathLike]) -> Dict[str, str]:
    """
    Get the ground truth string and x-y data from the given JSON file.

    :param filepath: The path to the JSON file
    :return dict: A dictionary containing the ground truth string, x-y data, chart type, id, and source
    """
    filepath = Path(filepath)
    with open(filepath) as fp:
        data = json.load(fp)
    all_x, all_y = process_data_series(data.get("data-series", []))
    chart_type = data.get("chart-type", "")
    chart_str = create_chart_string(chart_type)
    x_str = create_coordinate_string("x", all_x)
    y_str = create_coordinate_string("y", all_y)
    gt_string = chart_str + x_str + y_str
    return {
        "ground_truth": gt_string,
        "x": json.dumps(all_x),
        "y": json.dumps(all_y),
        "chart-type": chart_type,
        "id": filepath.stem,
        "source": data.get("source", ""),
    }
def gen_data(files: List[str], paths, get_gt_string_and_xy: Callable) -> Iterator[Dict[str, str]]:
    """
    Take a list of JSON files and return a generator that yields a
    dictionary with the ground truth string and the path to the image.

    :param files (list): A list of JSON files
    :return generator: A generator that yields a dictionary with the ground truth string and the path to the corresponding image.
    """
    for f in files:
        # Extract the image ID from the file path
        image_id = f.split("/")[-1].split(".")[0]
        # Construct the image path based on the ID
        image_path = paths.TRAIN_IMAGES_FOLDER + image_id + ".jpg"
        # Yield a dictionary containing the ground truth string, image path, and other information
        yield {
            **get_gt_string_and_xy(f),
            "image_path": image_path,
        }


ds = HFDataset.from_generator(
    gen_data,
    gen_kwargs={"files": train_json_files, "paths": paths, "get_gt_string_and_xy": get_gt_string_and_xy},
    num_proc=config.NUM_PROCESS,
)
print(f"Ground Truth string: \n {ds['ground_truth'][0]}")
I use this function to build a generator over my data. It calls a helper that is defined outside of it. At first I got an error saying that this helper function and the paths class were not defined, so I now pass them in through gen_kwargs. But now I get an error that Path and json are not defined, even though I have already imported those libraries in my notebook.
I can add from pathlib import Path inside the function and it works, but then I would have to do that for every library I want to use.
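For reference, that workaround looks like this; a trimmed sketch of the helper above, keeping only the part that triggers the error:

```python
def get_gt_string_and_xy(filepath):
    # Imports placed inside the body run in whichever process executes the
    # function, so the workers spawned by num_proc see these names too.
    import json
    from pathlib import Path

    filepath = Path(filepath)
    with open(filepath) as fp:
        data = json.load(fp)
    return {"id": filepath.stem, "source": data.get("source", "")}  # trimmed for brevity
```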
---------------------------------------------------------------------------
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "c:\Users\FR00CSS0000000040678\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\builder.py", line 1726, in _prepare_split_single
for key, record in generator:
File "c:\Users\FR00CSS0000000040678\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\packaged_modules\generator\generator.py", line 30, in _generate_examples
for idx, ex in enumerate(self.config.generator(**gen_kwargs)):
File "C:\Users\FR00CSS0000000040678\AppData\Local\Temp\ipykernel_22828\3924248845.py", line 18, in gen_data
File "C:\Users\FR00CSS0000000040678\AppData\Local\Temp\ipykernel_22828\332669057.py", line 28, in get_gt_string_and_xy
NameError: name 'Path' is not defined
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "c:\Users\FR00CSS0000000040678\AppData\Local\Programs\Python\Python311\Lib\site-packages\multiprocess\pool.py", line 125, in worker
result = (True, func(*args, **kwds))
^^^^^^^^^^^^^^^^^^^
File "c:\Users\FR00CSS0000000040678\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\utils\py_utils.py", line 614, in _write_generator_to_queue
for i, result in enumerate(func(**kwargs)):
File "c:\Users\FR00CSS0000000040678\AppData\Local\Programs\Python\Python311\Lib\site-packages\datasets\builder.py", line 1762, in _prepare_split_single
raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset
"""
...
772 return self._value
773 else:
--> 774 raise self._value
DatasetGenerationError: An error occurred while generating the dataset
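From what I understand, the NameError comes from the num_proc workers: datasets serializes the generator function and its gen_kwargs and replays them in processes spawned through multiprocess, and those processes never execute the notebook cells where pathlib and json were imported. A quick way to confirm that the generator itself is fine is to build the dataset without workers first, for example:

```python
# Sanity check (my own suggestion): with num_proc left out, everything runs in
# the main process, where the notebook-level imports are available.
ds = HFDataset.from_generator(
    gen_data,
    gen_kwargs={
        "files": train_json_files,
        "paths": paths,
        "get_gt_string_and_xy": get_gt_string_and_xy,
    },
)
print(ds["ground_truth"][0])
```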
class generate_data(config):
    def __init__(self):
        self.TRAIN_FOLDER = "./train"
        self.TRAIN_IMAGES_FOLDER = "./train/images/"
        self.TRAIN_JSON_FOLDER = "./train/annotations/"
        self.X_START = "<s_x_values>"
        self.X_END = "</s_x_values>"
        self.Y_START = "<s_y_values>"
        self.Y_END = "</s_y_values>"
        self.CHART_START = "<s_chart>"
        self.CHART_END = "</s_chart>"
        self.added_tokens = [self.X_START, self.X_END, self.Y_START, self.Y_END, self.CHART_START, self.CHART_END]

    def custom_round(self, value: Union[int, float, str]) -> Union[str, float]:
        """
        Convert a float value to a string with custom decimal truncation rules.
        If the absolute value of the integer part is greater than 1, truncate to 1 decimal.
        Otherwise, truncate to 4 decimals.

        Args:
            value (int, float, str): The float value to convert
        Returns:
            Union[str, float]: The rounded float value as a string or float
        """
        if isinstance(value, (int, float)):
            str_value = str(value)
            if "." in str_value:
                integer_part, decimal_part = str_value.split(".")
                decimal_limit = 1 if abs(float(integer_part)) > 1 else 4
                truncated_decimal = decimal_part[:decimal_limit]
                return float(f"{integer_part}.{truncated_decimal}")
        return value

    def is_not_a_number(self, value: Union[int, float, str]) -> bool:
        """
        Check if a value is not a number (NaN).

        Args:
            value (int, float, str): The value to check
        Returns:
            bool: True if the value is NaN, False otherwise
        """
        return isinstance(value, float) and str(value).lower() == "nan"
    class RollingAverageMeter:
        """Computes and stores a rolling average and current value."""

        def __init__(self):
            self.reset()

        def reset(self):
            """
            Reset all values to their initial state.
            """
            self.current_value = 0
            self.rolling_average = 0
            self.sum = 0
            self.count = 0

        def update(self, value, weight=1):
            """
            Update values based on new data.

            Args:
                value: The new value to update
                weight: Weight associated with the new value
            """
            self.current_value = value
            self.sum += value * weight
            self.count += weight
            self.rolling_average = self.sum / self.count
    def process_data_series(self, data_series):
        """Split a list of {"x": ..., "y": ...} points into x and y lists, skipping NaNs."""
        all_x, all_y = [], []
        for d in data_series:
            x = self.custom_round(d["x"])
            y = self.custom_round(d["y"])
            # Ignore nan values
            try:
                if self.is_not_a_number(x) or self.is_not_a_number(y):
                    continue
            except Exception:
                raise Exception(x, y)
            all_x.append(x)
            all_y.append(y)
        return all_x, all_y

    def create_chart_string(self, chart_type):
        """Wrap the chart type in its start/end tokens."""
        return self.CHART_START + chart_type + self.CHART_END

    def create_coordinate_string(self, label, values):
        """Join the values with ';' and wrap them in <s_{label}_values> tokens."""
        return f"<s_{label}_values>" + ";".join(map(str, values)) + f"</s_{label}_values>"

    def get_gt_string_and_xy(self, filepath: Union[str, os.PathLike]) -> Dict[str, str]:
        """
        Get the ground truth string and x-y data from the given JSON file.

        :param filepath: The path to the JSON file
        :return dict: A dictionary containing the ground truth string, x-y data, chart type, id, and source
        """
        # The imports have to live inside the method so they also run in the
        # worker processes spawned by num_proc.
        import json
        from pathlib import Path

        filepath = Path(filepath)
        with open(filepath) as fp:
            data = json.load(fp)
        all_x, all_y = self.process_data_series(data.get("data-series", []))
        chart_type = data.get("chart-type", "")
        chart_str = self.create_chart_string(chart_type)
        x_str = self.create_coordinate_string("x", all_x)
        y_str = self.create_coordinate_string("y", all_y)
        gt_string = chart_str + x_str + y_str
        return {
            "ground_truth": gt_string,
            "x": json.dumps(all_x),
            "y": json.dumps(all_y),
            "chart-type": chart_type,
            "id": filepath.stem,
            "source": data.get("source", ""),
        }
    def gen_data(self, files: List[str]) -> Iterator[Dict[str, str]]:
        """
        Take a list of JSON files and return a generator that yields a
        dictionary with the ground truth string and the path to the image.

        :param files (list): A list of JSON files
        :return generator: A generator that yields a dictionary with the ground truth string and the path to the corresponding image.
        """
        for f in files:
            # Extract the image ID from the file path
            image_id = f.split("/")[-1].split(".")[0]
            # Construct the image path based on the ID
            image_path = self.TRAIN_IMAGES_FOLDER + image_id + ".jpg"
            # Yield a dictionary containing the ground truth string, image path, and other information
            yield {
                **self.get_gt_string_and_xy(f),
                "image_path": image_path,
            }

    def get_generator(self):
        return HFDataset.from_generator(
            self.gen_data,
            gen_kwargs={"files": glob(self.TRAIN_JSON_FOLDER + "*.json")},
            num_proc=config.NUM_PROCESS,
        )
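With that class, building the dataset looks roughly like this (my own usage sketch; config is assumed to provide NUM_PROCESS as before):

```python
# Hypothetical usage of the class above, not part of the original notebook.
data_gen = generate_data()
ds = data_gen.get_generator()  # runs gen_data, possibly across num_proc workers
print(ds["ground_truth"][0])   # inspect one ground-truth string
```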
The solution I found is to group all the functions I need into a class, but I still have to put the imports inside the method for it to work.
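One alternative that should avoid repeating the imports in every function is to move the helpers into a regular .py file and import them into the notebook. Functions that live in an importable module are pickled by reference, so each worker process imports the module itself and its top-level imports run there too. A sketch, where the file name data_utils.py is just an example:

```python
# data_utils.py -- hypothetical module name, trimmed to the part that failed
import json
from pathlib import Path


def get_gt_string_and_xy(filepath):
    """Same logic as in the notebook, but defined in an importable module."""
    filepath = Path(filepath)
    with open(filepath) as fp:
        data = json.load(fp)
    return {"id": filepath.stem, "source": data.get("source", "")}  # trimmed

# in the notebook:
#   from data_utils import get_gt_string_and_xy
```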