很久没有更新博客了,最近handle了一个项目需要处理大量csv文件,需求是将csv文件中某单个列的多个values(该多个vaules以回车分隔)拆分成多个相同列的单个value,由于python在处理脚本先天比shell更有优势,这里把自己的最近学到python知识加以应用,编写了一个脚本,供大家参考.
#!/usr/bin/env python import csv import sys # Define the duality list Csv_content_edited = [] # Define the file that needs to be handled. try: file_name = sys.argv[1] new_file_name = file_name.split('.')[0] + '_new.' + file_name.split('.')[1] except IndexError: pass # Define the collumns that need to be dealt with. Field = ['Affects Version/s','Fix Version/s','Component/s'] """ Search the index numbers of each components of "Field" list in the first row of csv file, return the dic["field value":"the index of first_row"] """ def search_field_index(field, first_row): fields_dic = {} for x in first_row: if x in field: fields_dic[x] = first_row.index(x) return fields_dic ''' Find the maximum value number of each "Field" components in "Csv_content_edited" list, then insert the actual number of the columns after each "Field" columns. ''' def insert_max_col(field_index,Field_val): # Define the list stored maximum number of the value of the field index_len = [] # Define the list stored the value of the field index_val = [] # Recursive the row from 2 ~ max of the csv field for r in range(1,len(Csv_content_edited)): # Deal with special string c = repr(Csv_content_edited[r][field_index]).split("\\r") index_len.append(len(c)) index_val.append(c) # The maximum component of the "index_len" list max_index_len = max(index_len) # Clear " " and "'" string which are in the head or tail of each "index_val" components. for iv in range(0,len(index_val)): for _iv in range(0,len(index_val[iv])): index_val[iv][_iv] = index_val[iv][_iv].strip(' \'') # insert null value collumn after the field from "Field" list to the "Csv_content_edited" list for i in range(0,len(Csv_content_edited)): for m in range(0,max_index_len - 1): Csv_content_edited[i].insert(field_index + m + 1,'') # Update the first row of "Csv_content_edited" list to the field components from "Field" list for i1 in range(field_index,field_index + max_index_len): Csv_content_edited[0][i1] = Field_val # Update the rest rows of "Csv_content_edited" list to the field components from "index_val" list for i in range(1,len(Csv_content_edited)): for i2 in range(field_index,field_index + max_index_len): try: Csv_content_edited[i][i2] = index_val[i-1][i2 - field_index] except IndexError as s: pass # return Csv_content_edited[3],index_val,max_index_len # return len(Csv_content_edited),len(index_val) # print max_index_len # print index_val[2][0] # return len(index_val[2]) # Read and deal with the csv file. def readContent(file_name): # Read the csv file,then put it into list. with open(file_name, 'r') as csvfile: csv_reader = csv.reader(csvfile, delimiter=',') for row in csv_reader: if row[0] == 'Key': first_row = [] first_row = row # insert first row of csv to the list Csv_content_edited.append(first_row) else: # insert rest row of csv to the list Csv_content_edited.append(row) else: # Obtain all current "Field" index in "Csv_content_edited" list fields_index = search_field_index(Field,first_row) # Obtain the index of "Affects_Version" in in "Csv_content_edited" list Affects_Version_index = fields_index[Field[0]] # insert the matching value insert_max_col(Affects_Version_index,Field[0]) # Obtain all current "Field" index in "Csv_content_edited" list first_row_updated = Csv_content_edited[0] fields_index = search_field_index(Field,first_row_updated) # Obtain the index of "Affects_Version" in in "Csv_content_edited" list Fix_Version_index = fields_index[Field[1]] # insert the matching value insert_max_col(Fix_Version_index,Field[1]) # Obtain all current "Field" index in "Csv_content_edited" list first_row_updated = Csv_content_edited[0] fields_index = search_field_index(Field,first_row_updated) # Obtain the index of "Affects_Version" in in "Csv_content_edited" list Fix_Version_index = fields_index[Field[2]] # insert the matching value insert_max_col(Fix_Version_index,Field[2]) # Obtain the final edited list. return Csv_content_edited # Write the csv file. def writeContent(): with open(new_file_name,'wb') as csvfile: csv_writer = csv.writer(csvfile) csv_writer.writerows(readContent(file_name)) # Execute the finnal function. if __name__ == '__main__': try: writeContent() except (IOError,NameError,IndexError): print "Please type the correct file name. e.g: '" + sys.argv[0] + " testfile.csv'" else: print 'The result file is: %s' %new_file_name
处理前的效果
处理后的效果
如有雷同,纯属扯淡...