"""Validation of tab-separated measurement data files.

Each supported file type is described by a ``Validator`` record that bundles
a validation function with the column names the file type uses.  A validation
function inspects the first lines of a file, raises ``ValidationError`` if the
file does not look like the expected format, and otherwise returns the decimal
separator (``','`` or ``'.'``) guessed from the first data row.
"""

import os
from collections import namedtuple


class ValidationError(ValueError):
    """Raised when a file does not match the format a validator expects."""


# type:        human readable description of the file type
# extension:   file name extension (including the dot) used as lookup key
# validate:    callable taking an iterable of lines, returning the separator
# id_fields / data_fields / defaults: lists of column names; they are only
#              stored here, nothing in this module interprets them
Validator = namedtuple('Validator',
                       'type extension validate id_fields data_fields defaults')

# path:      path of a file that passed validation
# separator: decimal separator guessed for that file (',' or '.')
DataFile = namedtuple('DataFile', 'path separator')


def _guess_separator(numeric_data):
    """Return ``','`` if commas outnumber dots in *numeric_data*, else ``'.'``."""
    if numeric_data.count(',') > numeric_data.count('.'):
        return ','
    return '.'


def _skip_to_header(iterator, prefix):
    """Advance *iterator* just past the first line starting with *prefix*.

    Returns True if such a line was found, False if the iterator ran out.
    """
    # any() stops consuming as soon as the header line has been read, so the
    # next item of the iterator is the line following the header.
    return any(line.startswith(prefix) for line in iterator)


def _numeric_part(iterator, leading_columns):
    """Return the next line of *iterator* without its leading columns.

    The line is split on tabs at most *leading_columns* times and the
    remainder is returned.

    Raises:
        ValidationError: if there is no further line or the line has fewer
            than ``leading_columns + 1`` tab-separated columns.
    """
    line = next(iterator, None)
    if line is None:
        raise ValidationError('No Data Present')
    parts = line.split('\t', leading_columns)
    # maxsplit bounds the length from above, so only "too few" can happen.
    if len(parts) != leading_columns + 1:
        raise ValidationError('No Data Present')
    return parts[-1]


def validate_stx(lines):
    """Validate the lines of a ``.stx`` file and return its decimal separator.

    The first line must start with ``Report_Format:<TAB>2``, a later line must
    start with ``Probe_Name<TAB>``, and the line after that must contain at
    least two tab-separated columns.

    Raises:
        ValidationError: if any of these conditions is not met (this includes
            empty input).
    """
    iterator = iter(lines)
    # Default to '' so that empty input is reported as an unsupported file
    # instead of leaking StopIteration to the caller.
    line = next(iterator, '')
    if not line.startswith('Report_Format:\t2'):
        raise ValidationError('1 Unsupported File;' + line)
    if not _skip_to_header(iterator, 'Probe_Name\t'):
        raise ValidationError('1 Unsupported File')
    return _guess_separator(_numeric_part(iterator, 1))


def validate_dat(lines):
    """Validate the lines of a ``.dat`` file and return its decimal separator.

    The first line must start with ``Report_Format:<TAB>2``, a later line must
    start with ``Dot_Number<TAB>``, and the line after that must contain at
    least four tab-separated columns.

    Raises:
        ValidationError: if any of these conditions is not met (this includes
            empty input).
    """
    iterator = iter(lines)
    if not next(iterator, '').startswith('Report_Format:\t2'):
        raise ValidationError('Unsupported File')
    if not _skip_to_header(iterator, 'Dot_Number\t'):
        raise ValidationError('Unsupported File')
    return _guess_separator(_numeric_part(iterator, 3))


def validate_csv(lines):
    """Validate the lines of a ``.csv`` file and return its decimal separator.

    The first line must start with ``' ID '`` and the second line must contain
    at least two tab-separated columns.

    Raises:
        ValidationError: if any of these conditions is not met (this includes
            empty input).
    """
    iterator = iter(lines)
    if not next(iterator, '').startswith(' ID '):
        raise ValidationError('Unsupported File')
    return _guess_separator(_numeric_part(iterator, 1))


stx_validator = Validator(
    'Signalyse Statistic Files',
    '.stx',
    validate_stx,
    ['Probe_Name'],
    ['Count', 'Net_Signal', 'Net_Signal_SD', 'Net_Integral',
     'Net_Integral_SD', 'Proc_Control'],
    ['Net_Signal', 'Net_Signal_SD', 'Net_Integral', 'Net_Integral_SD'],
)

dat_validator = Validator(
    'Signalyse Data Files',
    '.dat',
    validate_dat,
    ['Dot_Number', 'Probe_Name', 'Gene_Name', 'Col', 'Row'],
    # NOTE(review): 'ROI_Heigth' is kept verbatim; presumably it mirrors the
    # column name written by the producing software - confirm before changing.
    ['X[Pix]', 'Y[Pix]', 'DX[Pix]', 'DY[Pix]', 'Spot_Diameter', 'ROI_Width',
     'ROI_Heigth', 'Pixels', 'Bkg', 'Bkg_SD', 'Net_Signal', 'Net_Signal_SD',
     'Net_Integral', 'Net_Integral_SD', 'Acc_Number', 'Proc_Control'],
    ['Bkg', 'Bkg_SD', 'Net_Signal', 'Net_Signal_SD', 'Net_Integral',
     'Net_Integral_SD'],
)

csv_validator = Validator(
    'Sensovation Data Files',
    '.csv',
    validate_csv,
    [' ID '],
    ['Pos.X', 'Pos.Y', 'Bkg.Mean', 'Spot.Mean', 'Bkg.Median', 'Spot.Median',
     'Bkg.StdDev', 'Spot.StdDev', 'Bkg.Sum', 'Spot.Sum', 'Bkg.Area',
     'Spot.Area', 'Spot.Sat. (%)', 'Found', 'Pos.Nom.X', 'Pos.Nom.Y', 'Dia.',
     'Rect.', 'Contour'],
    ['Bkg.Mean', 'Spot.Mean', 'Bkg.Median', 'Spot.Median', 'Bkg.StdDev',
     'Spot.StdDev', 'Bkg.Sum', 'Spot.Sum'],
)

# Lookup table from file extension to the validator handling it.
validation_map = {
    stx_validator.extension: stx_validator,
    dat_validator.extension: dat_validator,
    csv_validator.extension: csv_validator,
}


def guess_validator(unvalidated):
    """Return the validator matching the extension of the first path.

    Args:
        unvalidated: sequence of file paths; only the first one is examined.

    Returns:
        The ``Validator`` registered for that extension, or ``None`` if the
        extension is unknown or *unvalidated* is empty.
    """
    if not unvalidated:
        return None
    _, extension = os.path.splitext(unvalidated[0])
    return validation_map.get(extension)


def validate_files(unvalidated, selected_validator):
    """Yield a ``DataFile`` for every path that passes validation.

    Each file is opened as UTF-8 text and handed to
    ``selected_validator.validate``.  Files that cannot be opened, cannot be
    decoded, or fail validation are skipped; the error is printed.

    Args:
        unvalidated: iterable of file paths.
        selected_validator: ``Validator`` whose ``validate`` callable is used.

    Yields:
        ``DataFile(path, separator)`` for each accepted file, in input order.
    """
    for file_path in unvalidated:
        try:
            with open(file_path, mode='r', encoding='utf-8') as file_handle:
                separator = selected_validator.validate(file_handle)
        except (OSError, UnicodeError, ValidationError) as error:
            # Best effort: report the problem and continue with the next file.
            print(error)
        else:
            # Yield outside the with/try so the file is already closed and
            # exceptions thrown in at the yield are not swallowed here.
            yield DataFile(file_path, separator)