merge

Simple tool to quickly merge datasets for statistical analysis
git clone git://git.wrycode.com/wrycode/archive/merge.git
Log | Files | Refs | README | LICENSE

main.go (11362B)


      1 package main
      2 
import (
	"bufio"
	"encoding/csv"
	"fmt"
	"log"
	"os"
	"path/filepath"
	"strings"
	"unsafe"

	"C"

	"github.com/360EntSecGroup-Skylar/excelize"
	"github.com/gotk3/gotk3/gtk"
	"github.com/knieriem/odf/ods"
	//	"github.com/gotk3/gotk3/gdk"
)
     17 
     18 // object that holds every widget
     19 type view struct {
     20 	mainBox *gtk.Box
     21 	datasetListBox *gtk.ListBox
     22 	addButton *gtk.Button
     23 	addDatasetDialog *gtk.FileChooserNativeDialog
     24 	saveDialog *gtk.FileChooserNativeDialog
     25 	errorLabel *gtk.Label
     26 	mergeButton *gtk.Button
     27 }
     28 
     29 // gtk boilerplate
     30 func windowSetup() *gtk.Window {
     31 
     32 	os.Setenv("GSETTINGS_SCHEMA_DIR", ".\\share\\glib-2.0\\schemas")
     33 	// Set up GTK
     34 	gtk.Init(&os.Args)
     35 
     36 	window, err := gtk.WindowNew(gtk.WINDOW_TOPLEVEL)
     37 	if err != nil {
     38 		log.Fatal("Unable to create window:", err)
     39 	}
     40 	window.SetTitle("Merge Datasets")
     41 	window.Connect("destroy", func() {
     42 		gtk.MainQuit()
     43 	})
     44 
     45 	window.SetName("main-window") // for css selector
     46 
     47 	return window
     48 }
     49 
     50 // returns an initial view object (a skeleton that still needs the
     51 // widgets initialized)
     52 func viewSetup() view {
     53 
     54 	var v view
     55 	var err error
     56 
     57 	// main box container
     58 	v.mainBox, err = gtk.BoxNew(gtk.ORIENTATION_VERTICAL, 5)
     59 	if err != nil {
     60 		log.Fatal("Unable to create grid: ", err)
     61 	}
     62 
     63 	// listBox of dataset names + remove buttons
     64 	v.datasetListBox, err = gtk.ListBoxNew()
     65 	v.mainBox.PackStart(v.datasetListBox,false, false, 0)
     66 
     67 	// button to add new datasets
     68 	v.addButton, err = gtk.ButtonNewWithLabel("Add Dataset(s)")
     69 	if err != nil {
     70 		log.Fatal("unable to create addButton")
     71 	}
     72 	v.mainBox.PackStart(v.addButton, false, false, 0)
     73 
     74 
     75 	// merge datasets
     76 	v.mergeButton, err = gtk.ButtonNewWithLabel("Merge Datasets")
     77 	if err != nil {
     78 		log.Fatal("Unable to create mergeButton", err)
     79 	}
     80 	v.mainBox.PackStart(v.mergeButton, false, false, 0)
     81 
     82 	v.errorLabel = newLabel("")
     83 	v.mainBox.PackStart(v.errorLabel, false, false, 0)
     84 
     85 	return v
     86 }
     87 
     88 // connect the "Add Dataset" button to the dialog box and the update
     89 // the list of datasets
     90 func addButtonSetup(v view, filepaths *[]string, window *gtk.Window) {
     91 
     92 	var err error
     93 
     94 	// native file chooser
     95 	v.addDatasetDialog, err = gtk.FileChooserNativeDialogNew("open",window,gtk.FILE_CHOOSER_ACTION_OPEN,"open","cancel")
     96 	if err != nil {
     97 		log.Fatal("Can't make addDatasetDialog: ", err)
     98 	}
     99 	// user can add multiple datasets
    100 	v.addDatasetDialog.SetSelectMultiple(true)
    101 
    102 	// filters for acceptable filetypes to open
    103 	CsvFilter, err := gtk.FileFilterNew(); CsvFilter.AddPattern("*.csv"); CsvFilter.SetName("Comma Separated Values")
    104 	ExcelFilter, err := gtk.FileFilterNew(); ExcelFilter.AddPattern("*.xlsx"); ExcelFilter.SetName("Microsoft Excel")
    105 	OdsFilter, err := gtk.FileFilterNew(); OdsFilter.AddPattern("*.ods"); OdsFilter.SetName("Open Document Spreadsheet")
    106 	AllFilter, err := gtk.FileFilterNew(); AllFilter.AddPattern("*"); AllFilter.SetName("All File Types")
    107 	SpreadsheetFilter, err := gtk.FileFilterNew();
    108 	SpreadsheetFilter.AddPattern("*.csv");
    109 	SpreadsheetFilter.AddPattern("*.ods");
    110 	SpreadsheetFilter.AddPattern("*.xlsx");
    111 	SpreadsheetFilter.SetName("All Spreadsheets")
    112 
    113 	v.addDatasetDialog.AddFilter(CsvFilter)
    114 	v.addDatasetDialog.AddFilter(ExcelFilter)
    115 	v.addDatasetDialog.AddFilter(OdsFilter)
    116 	v.addDatasetDialog.AddFilter(AllFilter)
    117 	v.addDatasetDialog.AddFilter(SpreadsheetFilter)
    118 	v.addDatasetDialog.SetFilter(SpreadsheetFilter)
    119 
    120 	_, err = v.addButton.Connect("clicked", func() {
    121 		var response gtk.ResponseType
    122 		response = gtk.ResponseType(v.addDatasetDialog.Run())
    123 		list, err := v.addDatasetDialog.GetFilenames()
    124 		if err == nil && response == gtk.RESPONSE_ACCEPT {
    125 			// list is a *glib.SList returned by GetFilenames.
    126 			// glib.SList.Foreach iterates over items in a list
    127 			// and provides unsafe.Pointers to the C data. Here we
    128 			// can convert the C []chars to Golang strings using
    129 			// cgo
    130 			list.Foreach(func(ptr unsafe.Pointer) {
    131 				filename := C.GoString((*C.char)(ptr))
    132 				// add path to list
    133 				*filepaths = append(*filepaths, filename)
    134 			})
    135 			// refresh the view
    136 			rebuildDatasetListBox(v.datasetListBox, filepaths, window)
    137 			window.ShowAll()
    138 		}
    139 	})
    140 }
    141 
    142 // connect the "Merge Datasets" button to the correct output file
    143 // chooser and the logic for merging and exporting the datasets
    144 func mergeButtonSetup(v view, filenames *[]string, window *gtk.Window) {
    145 	var err error
    146 
    147 	// another native file chooser
    148 	v.saveDialog, err = gtk.FileChooserNativeDialogNew("save",window,gtk.FILE_CHOOSER_ACTION_SAVE,"save","cancel")
    149 	if err != nil {
    150 		log.Fatal("Can't make saveDialog: ", err)
    151 	}
    152 
    153 	// unfortunately gtk doesn't seem to provide a way to
    154 	// enforce file extensions when creating a file with a
    155 	// gtkfilechooser, so we have to do it manually. These
    156 	// filters just amount to helpful hints for the user
    157 	CsvFilter, err := gtk.FileFilterNew(); CsvFilter.AddPattern("*.csv"); CsvFilter.SetName("Comma Separated Values")
    158 	ExcelFilter, err := gtk.FileFilterNew(); ExcelFilter.AddPattern("*.xlsx"); ExcelFilter.SetName("Microsoft Excel")
    159 	v.saveDialog.AddFilter(CsvFilter)
    160 	v.saveDialog.AddFilter(ExcelFilter)
    161 	v.saveDialog.SetFilter(CsvFilter)
    162 	v.saveDialog.SetFilename("merged.csv")
    163 	v.saveDialog.SetDoOverwriteConfirmation(true) // kinda important
    164 
    165 	_, err = v.mergeButton.Connect("clicked", func() {
    166 		// clear old error
    167 		v.errorLabel.SetText("")
    168 
    169 		if goodFileExtensions(filenames) && len(*filenames) != 0 {
    170 			response := gtk.ResponseType(v.saveDialog.Run())
    171 
    172 			if response == gtk.RESPONSE_ACCEPT {
    173 				outputFile := v.saveDialog.GetFilename()
    174 
    175 				// supported file extensions and their associated function for pulling the raw data
    176 				fileFormat := map[string]func(string) [][]string{
    177 					".xlsx": pullExcel,
    178 					".csv": pullCSV,
    179 					".ods": pullODS,
    180 				}
    181 
    182 				// map of basenames to the [][]string data
    183 				raws := make(map[string][][]string)
    184 
    185 				// all datasets
    186 				datasets := make(map[string]*Dataset)
    187 
    188 				// import dataset based on file extension
    189 				for _, path := range *filenames {
    190 					basename := strings.TrimSuffix(filepath.Base(path), filepath.Ext(path))
    191 					ext := strings.ToLower(filepath.Ext(path))
    192 					if _, ok := fileFormat[ext]; ok {
    193 						raws[basename] = fileFormat[ext](path)
    194 					}
    195 					for name, data := range raws {
    196 						datasets[name] = ImportDataset(name, data)
    197 					}
    198 				}
    199 
    200 				// merge them into a single Dataset
    201 				var dataset Dataset
    202 				dataset.height = 1 // row of terms, even though it's empty
    203 				dataset.data = make(map[string][]string)
    204 				for _, d := range datasets {
    205 					mergeDatasets(&dataset,d)
    206 				}
    207 
    208 				// export dataset
    209 				exportDataset(&dataset, outputFile)
    210 			}
    211 		} else {
    212 			v.errorLabel.SetText("There was a problem with the files you chose. Make sure to choose supported spreadsheet formats.")
    213 		}
    214 	})
    215 }
    216 
    217 // rebuild the list of selected dataset files (with "delete" buttons)
    218 // and refresh the window
    219 func rebuildDatasetListBox(list *gtk.ListBox, filenames *[]string, window *gtk.Window) {
    220 	// clear list
    221 	list.GetChildren().Foreach(func(item interface{}) {
    222 		list.Remove(item.(*gtk.Widget))
    223 	})
    224 
    225 	for _, filename := range *filenames {
    226 		// local variable necessary; see https://github.com/golang/go/wiki/CommonMistakes#using-goroutines-on-loop-iterator-variables
    227 		path := filename
    228 
    229 		// box to hold the two elements
    230 		box, err := gtk.BoxNew(gtk.ORIENTATION_HORIZONTAL, 2)
    231 
    232 		// show the filename
    233 		text := newLabel(filename)
    234 		text.SetSelectable(true)
    235 		box.PackStart(text, false, false, 0)
    236 
    237 		// add the "Remove" button
    238 		removeButton, err := gtk.ButtonNewWithLabel("Remove")
    239 		if err != nil {
    240 			log.Fatal("Unable to create removeButton", err)
    241 		}
    242 		box.PackEnd(removeButton,false, false, 0)
    243 		trashImage, err := gtk.ImageNewFromFile("trash.png")
    244 		removeButton.SetImage(trashImage)
    245 
    246 		// "Remove button" removes the path from the list of filepaths and runs this function again
    247 		_, err = removeButton.Connect("clicked", func() {
    248 			*filenames = Remove(*filenames, path)
    249 			rebuildDatasetListBox(list, filenames, window)
    250 			window.ShowAll()
    251 		})
    252 
    253 		// add the two elements to the overall list
    254 		list.Insert(box, 0)
    255 	}
    256 }
    257 
    258 func newLabel(s string) *gtk.Label {
    259 	l, err := gtk.LabelNew(s)
    260 	if err != nil {
    261 		log.Fatal("Unable to create label",s,":", err)
    262 	}
    263 	return l
    264 }
    265 
    266 // very dumb file extension checking
    267 func goodFileExtensions(filepaths *[]string) bool {
    268 	for _, path := range *filepaths {
    269 		ext := strings.ToLower(filepath.Ext(path))
    270 		if ext != ".csv" && ext != ".xlsx" && ext != ".ods" {
    271 			return false
    272 		}
    273 	}
    274 	return true
    275 }
    276 
    277 func pullExcel(path string) [][]string {
    278 	var rows [][]string
    279 	f, err := excelize.OpenFile(path)
    280 	if err != nil {
    281 		fmt.Println("Problem importing ", path, err)
    282 		return rows
    283 	}
    284 
    285 	sheetName := f.GetSheetMap()[1]
    286 	// fmt.Println(sheetName)
    287 
    288 	rows, err = f.GetRows(sheetName)
    289 	if err != nil {
    290 		fmt.Println("Problem getting rows from ", path, err)
    291 	}
    292 	return rows
    293 }
    294 
    295 func pullCSV(path string) [][]string {
    296 	var rows [][]string
    297 
    298 	f, err := os.Open(path)
    299 	if err != nil {
    300 		fmt.Printf("Cannot open '%s': %s\n", path, err.Error())
    301 		return rows
    302 	}
    303 	defer f.Close()
    304 
    305 	// TODO do I need to close the reader?
    306 	r := csv.NewReader(f)
    307 	r.LazyQuotes = true
    308 	rows, err = r.ReadAll()
    309 
    310 	if err != nil {
    311 		fmt.Println("trouble reading rows in ", path)
    312 		return rows
    313 	}
    314 	return rows
    315 }
    316 
    317 func pullODS(path string) [][]string {
    318 	var rows [][]string
    319 
    320 	f, err := ods.Open(path)
    321 	if err != nil {
    322 		fmt.Println("trouble reading ods file", path,":",err)
    323 		return rows
    324 	}
    325 
    326 	var d ods.Doc
    327 
    328 	err = f.ParseContent(&d)
    329 
    330 	if err != nil {
    331 		fmt.Println("trouble parsing ods file", path,":",err)
    332 		return rows
    333 	}
    334 
    335 	rows = d.Table[0].Strings()
    336 	return rows
    337 }
    338 
    339 // see import.go for Dataset fields and methods used in this file
    340 func mergeDatasets(base *Dataset, new *Dataset) {
    341 	numOld := base.height - 1
    342 	numNew := new.height - 1 // number of specimens being added
    343 
    344 	// on the first merge the base dataset is empty (no terms or data)
    345 	if !Include(base.terms, "DATASET") {
    346 		base.terms = append(base.terms, "DATASET")
    347 	}
    348 
    349 	// add DATASET term to show which file the specimen comes from
    350 	new.terms = append(new.terms, "DATASET")
    351 	new.data["DATASET"] = make([]string, numNew)
    352 	for i, _ := range new.data["DATASET"] {
    353 		new.data["DATASET"][i] = new.name
    354 	}
    355 
    356 	// add blank data for new terms not present in the old dataset
    357 	for _, term := range new.terms {
    358 		if !Include(base.terms, term) {
    359 			base.terms = append(base.terms, term)
    360 			base.data[term] = make([]string, numOld)
    361 		}
    362 	}
    363 
    364 	// add blank data for old terms not present in the new dataset
    365 	for _, term := range base.terms {
    366 		if !Include(new.terms, term) {
    367 			new.terms = append(new.terms, term)
    368 			new.data[term] = make([]string,  numNew)
    369 		}
    370 	}
    371 
    372 	// datasets are equalized now (no new data, just blank terms
    373 	// where appropriate) so merging them is a simple append
    374 	for _, term := range base.terms {
    375 		base.data[term] = append(base.data[term], new.data[term]...)
    376 	}
    377 
    378 	// update height
    379 	base.height += numNew
    380 }
    381 
    382 func main() {
    383 
    384 	window := windowSetup()
    385 
    386 	// struct containing the main widgets
    387 	v := viewSetup()
    388 
    389 	// holds the paths to the datasets to be merged; this var is
    390 	// passed around when removing and adding datasets
    391 	filepaths := &[]string{}
    392 
    393 	addButtonSetup(v, filepaths, window)
    394 	mergeButtonSetup(v, filepaths, window)
    395 
    396 	// refresh the (empty) list of datasets for the first time
    397 	rebuildDatasetListBox(v.datasetListBox, filepaths, window)
    398 
    399 	window.Add(v.mainBox)
    400 
    401 	window.ShowAll()
    402 	gtk.Main()
    403 }
    404 
    405