Convert Microarray Data to Excel Files

import os
from collections import namedtuple


class ValidationError(ValueError):
    pass


# A supported file format: display name, extension, validation function and column roles.
Validator = namedtuple('Validator', 'type extension validate id_fields data_fields defaults')
# A validated input file: path, decimal separator, header lines to skip and its Validator.
DataFile = namedtuple('DataFile', 'path separator skip validator')


def validate_stx(lines):
    iterator = enumerate(lines)
    _, line = next(iterator)
    if not line.startswith('Report_Format:\t2'):
        raise ValidationError('Unsupported File: ' + line)
    # skip the header block until the column header line is reached
    for i, line in iterator:
        if line.startswith('Probe_Name\t'):
            break
    else:
        raise ValidationError('Unsupported File')
    try:
        _, line = next(iterator)
        _, numeric_data = line.split('\t', 1)
    except (ValueError, StopIteration):
        raise ValidationError('No Data Present')
    # guess the decimal separator from the first data line
    separator = ',' if numeric_data.count(',') > numeric_data.count('.') else '.'
    return separator, i


def validate_dat(lines):
    iterator = enumerate(lines)
    _, line = next(iterator)
    if not line.startswith('Report_Format:\t2'):
        raise ValidationError('Unsupported File')
    for i, line in iterator:
        if line.startswith('Dot_Number\t'):
            break
    else:
        raise ValidationError('Unsupported File')
    try:
        _, line = next(iterator)
        _, _, _, numeric_data = line.split('\t', 3)
    except (ValueError, StopIteration):
        raise ValidationError('No Data Present')
    separator = ',' if numeric_data.count(',') > numeric_data.count('.') else '.'
    return separator, i


def validate_csv(lines):
    iterator = iter(lines)
    if not next(iterator).startswith(' ID '):
        raise ValidationError('Unsupported File')
    try:
        line = next(iterator)
        _, numeric_data = line.split('\t', 1)
    except (ValueError, StopIteration):
        raise ValidationError('No Data Present')
    separator = ',' if numeric_data.count(',') > numeric_data.count('.') else '.'
    return separator, 0


stx_validator = Validator(
    'Signalyse Statistic Files', '.stx', validate_stx,
    ['Probe_Name'],
    ['Count', 'Net_Signal', 'Net_Signal_SD', 'Net_Integral', 'Net_Integral_SD', 'Proc_Control'],
    ['Net_Signal', 'Net_Signal_SD', 'Net_Integral', 'Net_Integral_SD'])

dat_validator = Validator(
    'Signalyse Data Files', '.dat', validate_dat,
    ['Dot_Number', 'Probe_Name', 'Gene_Name', 'Col', 'Row'],
    ['X[Pix]', 'Y[Pix]', 'DX[Pix]', 'DY[Pix]', 'Spot_Diameter', 'ROI_Width', 'ROI_Heigth', 'Pixels', 'Bkg',
     'Bkg_SD', 'Net_Signal', 'Net_Signal_SD', 'Net_Integral', 'Net_Integral_SD', 'Acc_Number', 'Proc_Control'],
    ['Bkg', 'Net_Signal', 'Net_Integral'])

csv_validator = Validator(
    'Sensovation Data Files', '.csv', validate_csv,
    [' ID '],
    ['Pos.X', 'Pos.Y', 'Bkg.Mean', 'Spot.Mean', 'Bkg.Median', 'Spot.Median', 'Bkg.StdDev', 'Spot.StdDev',
     'Bkg.Sum', 'Spot.Sum', 'Bkg.Area', 'Spot.Area', 'Spot.Sat. (%)', 'Found', 'Pos.Nom.X', 'Pos.Nom.Y', 'Dia.',
     'Rect.', 'Contour'],
    ['Bkg.Mean', 'Spot.Mean', 'Bkg.Median', 'Spot.Median', 'Bkg.StdDev', 'Spot.StdDev', 'Bkg.Sum', 'Spot.Sum'])

validation_map = {
    stx_validator.extension: stx_validator,
    dat_validator.extension: dat_validator,
    csv_validator.extension: csv_validator
}


def guess_validator(unvalidated):
    # get the validation method by examining the first file
    _, extension = os.path.splitext(unvalidated[0])
    return validation_map.get(extension, None)


def validate_files(unvalidated, selected_validator):
    # validate each file with the selected validator; report and skip files that fail
    for file_path in unvalidated:
        try:
            with open(file_path, mode='r', encoding='utf-8') as file_handle:
                separator, skip = selected_validator.validate(file_handle)
            yield DataFile(file_path, separator, skip, selected_validator)
        except (IOError, UnicodeError, ValidationError) as e:
            print(e)
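

# --- Usage sketch (not part of the original module) ---
# A minimal example of how the pieces fit together: collect the paths of the raw
# export files, guess a Validator from the first file's extension, then iterate
# over the validated DataFile tuples. The directory name 'microarray_exports' is
# hypothetical and only used for illustration.
if __name__ == '__main__':
    source_dir = 'microarray_exports'
    unvalidated = [os.path.join(source_dir, name) for name in sorted(os.listdir(source_dir))]
    validator = guess_validator(unvalidated) if unvalidated else None
    if validator is None:
        print('No suitable validator found')
    else:
        for data_file in validate_files(unvalidated, validator):
            print(data_file.path, data_file.separator, data_file.skip)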