Source code for file_io.remove_psv_spaces

import argparse

# Line prefixes that mark a "pre-header" line. Leading lines of the input file
# that start with any of these strings are treated as comments rather than as
# the column header row. Kept as a tuple so it can be passed directly to
# ``str.startswith``.
PRE_HEADER_COMMENT_AND_EXCLUDE_STRINGS = ("#", "!")
def _parse_bool_flag(value):
    """Convert a command-line string into a boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because any non-empty string is truthy. This helper maps the usual
    spellings explicitly and rejects anything else.

    Parameters
    ----------
    value : str
        The raw string supplied on the command line.

    Returns
    -------
    bool
        The boolean the string represents.

    Raises
    ------
    argparse.ArgumentTypeError
        If the string is not a recognized boolean spelling.
    """
    normalized = value.strip().lower()
    if normalized in ("1", "true", "t", "yes", "y", "on"):
        return True
    # The empty string is kept as False, matching what bool("") produced before.
    if normalized in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got '{value}'.")


def main():
    """Parse the command-line arguments and run the utility.

    Returns
    -------
    int or None
        Whatever ``execute`` returns for the parsed arguments.
    """
    parser = argparse.ArgumentParser(
        prog="layup utility - remove psv spaces",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This utility removes spaces from Rubin-formatted PSV files to ensure compatibility with other tools.",
    )

    req = parser.add_argument_group("Required arguments")
    req.add_argument(
        "-i",
        "--input-file",
        help="The file path to the input PSV file that needs spaces removed.",
        type=str,
        dest="input_file",
        required=True,
    )
    req.add_argument(
        "-o",
        "--output-file",
        help="The file path to the output PSV file with spaces removed.",
        type=str,
        dest="output_file",
        required=True,
    )

    optional = parser.add_argument_group("Optional arguments")
    optional.add_argument(
        "-f",
        "--force",
        help="Force overwrite of existing files without prompting.",
        # nargs="?" + const=True lets a bare ``--force`` act as a switch while
        # still accepting the older ``--force True`` / ``--force False`` form.
        nargs="?",
        const=True,
        type=_parse_bool_flag,
        dest="force",
        default=False,
        required=False,
    )

    args = parser.parse_args()

    return execute(args)
def execute(args):
    """Remove leading and trailing spaces from every field of a PSV file.

    The comment lines at the top of the input file are copied unchanged to the
    output file, followed by the header row and data with every column name
    and value stripped of surrounding whitespace.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed arguments. The attributes read are ``input_file``,
        ``output_file`` and ``force``.

    Returns
    -------
    int or None
        ``1`` when the input file is missing, has no header row, or the output
        file already exists and ``force`` is not set. ``None`` on success.
    """
    # Imported here rather than at module level, as in the original code.
    from pathlib import Path

    import pandas as pd

    # Do a little input validation
    input_file_path = Path(args.input_file).resolve()
    if not input_file_path.exists():
        print(f"Error: Input file '{input_file_path}' does not exist.")
        return 1

    output_file_path = Path(args.output_file).resolve()
    if output_file_path.exists() and not args.force:
        print(f"Error: Output file '{output_file_path}' already exists.")
        print("Use --force to overwrite the existing file.")
        return 1

    # Collect the leading comment lines and find the header row in one pass.
    # Keeping the comment text in memory means the input never has to be
    # re-read once the output file (possibly the same path) has been opened
    # for writing.
    pre_header_lines = []
    header_row = None
    with open(input_file_path) as fh:
        for line in fh:
            if line.startswith(PRE_HEADER_COMMENT_AND_EXCLUDE_STRINGS):
                pre_header_lines.append(line)
            else:
                header_row = line
                break

    # An empty file, or one holding only comment lines, has nothing to convert.
    if header_row is None:
        print(f"Error: Input file '{input_file_path}' does not contain a header row.")
        return 1

    # The header row INDEX is 0-indexed, so it equals the number of lines to skip.
    num_pre_header_lines = len(pre_header_lines)

    # Define the pd.read_csv "converters" functions to process each value in all columns.
    # Only the line terminator is removed before splitting: the keys must match
    # the unstripped column labels that pandas parses, otherwise the converter
    # for a padded first or last column would never be applied.
    column_converters = {col_name: str.strip for col_name in header_row.rstrip("\r\n").split("|")}

    # Read in the PSV file, removing leading and trailing spaces from all values in all columns.
    # skiprows keeps pd.read_csv from trying to parse the pre-header comments.
    res_df = pd.read_csv(
        input_file_path,
        sep="|",
        skiprows=num_pre_header_lines,
        converters=column_converters,
    )

    # Update the names of the columns to remove any leading or trailing spaces.
    res_df.columns = [col.strip() for col in res_df.columns]

    # Copy over the comments at the top of the input file.
    with open(output_file_path, "w") as output_file:
        output_file.writelines(pre_header_lines)

    # Append the header and data after the comments. Mode "a" is required:
    # mode "w" would truncate the file and discard the comments just written.
    res_df.to_csv(output_file_path, sep="|", mode="a", index=False)
if __name__ == "__main__":
    # Pass main()'s return value to the interpreter as the process exit status,
    # so a failure reported by returning 1 is visible to the calling shell.
    # A return value of None still exits with status 0.
    raise SystemExit(main())