merge

Simple tool to quickly merge datasets for statistical analysis
git clone git://git.wrycode.com/wrycode/archive/merge.git
Log | Files | Refs | README | LICENSE

commit 58356b34ac917bd7084566c48ea681d25f28e9ca
parent 151b56db47f3dfaf48024eb60c93e61c55d5fcf8
Author: Nick Econopouly <wry@mm.st>
Date:   Thu,  5 Mar 2020 18:30:15 -0500

Fix silly bug in mergeDatasets

Diffstat:
M import.go | 43 ++++++++++++-------------------------------
M main.go | 155 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
2 files changed, 141 insertions(+), 57 deletions(-)

diff --git a/import.go b/import.go @@ -11,38 +11,19 @@ type Dataset struct { data map[string][]string // maps terms to ordered data (columns) terms []string // ordered list of terms name string // name of the dataset - height int // height of the XLSX file (including the first row!) + height int // height of the XLSX file (including the first row of terms) } -// ImportXLSX imports an XLSX file. It takes filename, sheet name, -// name for the Dataset internally, and height in rows as arguments -// and returns a *Dataset. -func ImportXLSX(filename string) *Dataset { +func ImportDataset(name string, rows [][]string) *Dataset { + fmt.Println("name is ", name) // Initialize Dataset var d Dataset d.data = make(map[string][]string) - d.name = filename - // d.height = height - // pull data from file - f, err := excelize.OpenFile(filename) - if err != nil { - fmt.Println("Problem importing ", filename, err) - return &d - } - - sheetName := f.GetSheetMap()[1] - fmt.Println(sheetName) - - rows, err := f.GetRows(sheetName) - - if err != nil { - fmt.Println("Cannot read XLSX data in", filename, err.Error()) - return &d - } d.height = len(rows) d.terms = rows[0] + d.name = name // Create blank column for each term for _, term := range d.terms { @@ -77,15 +58,15 @@ func (d *Dataset) removeUnusedTerms() { } } -func ImportDatasets(paths []string) map[string]*Dataset { - datasets := make(map[string]*Dataset) +// func ImportDatasets(paths []string) map[string]*Dataset { +// datasets := make(map[string]*Dataset) - for _, file := range paths { - // fmt.Println(file) - datasets[file] = ImportXLSX(file) - } - return datasets -} +// for _, file := range paths { +// // fmt.Println(file) +// datasets[file] = ImportXLSX(file) +// } +// return datasets +// } func exportDataset(d *Dataset, name string) { f := excelize.NewFile() diff --git a/main.go b/main.go @@ -4,62 +4,165 @@ import ( "fmt" "os" "log" - "path/filepath" + "path/filepath" + "strings" + 
"github.com/360EntSecGroup-Skylar/excelize" + "github.com/knieriem/odf/ods" + "encoding/csv" ) -func getExcelFiles() []string { - var files []string +const EXPORT_BASE_FILENAME = "merged" + +func pullExcel(path string) [][]string { + var rows [][]string + f, err := excelize.OpenFile(path) + if err != nil { + fmt.Println("Problem importing ", path, err) + return rows + } + + sheetName := f.GetSheetMap()[1] + // fmt.Println(sheetName) + + rows, err = f.GetRows(sheetName) + if err != nil { + fmt.Println("Problem getting rows from ", path, err) + } + return rows +} + +func pullCSV(path string) [][]string { + var rows [][]string - path, err := os.Getwd() + f, err := os.Open(path) if err != nil { - log.Println(err) + fmt.Printf("Cannot open '%s': %s\n", path, err.Error()) + return rows } + defer f.Close() - fmt.Println("Attempting to merge Microsoft Excel files in ", path) + // TODO do I need to close the reader? + r := csv.NewReader(f) + r.LazyQuotes = true + rows, err = r.ReadAll() - err = filepath.Walk(path, func(path string, info os.FileInfo, err error) error { - if info.IsDir() { - return nil - } - if filepath.Base(path) == "merged.xlsx" { // ignore our output file - return nil - } - if filepath.Ext(path) == ".xlsx" || filepath.Ext(path) == ".XLSX" { - files = append(files, path) - } - return nil - }) if err != nil { - panic(err) + fmt.Println("trouble reading rows in ", path) + return rows } - return files + return rows +} + +func pullODS(path string) [][]string { + var rows [][]string + + f, err := ods.Open(path) + if err != nil { + fmt.Println("trouble reading ods file", path,":",err) + return rows + } + + var d ods.Doc + + err = f.ParseContent(&d) + + if err != nil { + fmt.Println("trouble parsing ods file", path,":",err) + return rows + } + + rows = d.Table[0].Strings() + return rows + } func mergeDatasets(base *Dataset, new *Dataset) { numOld := base.height - 1 - numNew := new.height - 1 // number of specimens being added + numNew := new.height - 1 // number 
of specimens being added + + // on the first merge the base dataset is empty (no terms or data) + if !Include(base.terms, "DATASET") { + base.terms = append(base.terms, "DATASET") + } + + // add DATASET term + new.terms = append(new.terms, "DATASET") + new.data["DATASET"] = make([]string, numNew) + for i, _ := range new.data["DATASET"] { + new.data["DATASET"][i] = new.name + } + // fmt.Println(new.name) + // add blank data for new terms not present in the old dataset for _, term := range new.terms { if !Include(base.terms, term) { + fmt.Println("base.terms is ", base.terms, " and term is", term) base.terms = append(base.terms, term) base.data[term] = make([]string, numOld) } + } + // add blank data for old terms not present in the new dataset + for _, term := range base.terms { + if !Include(new.terms, term) { + fmt.Println("new.terms is ", new.terms, " and term is", term) + new.terms = append(new.terms, term) + new.data[term] = make([]string, numNew) + } } + // datasets are equalized (no new data, just blank terms where + // appropriate) so merging them is a simple append for _, term := range base.terms { - base.data[term] = append(base.data[term], new.data[term]...) + base.data[term] = append(base.data[term], new.data[term]...) 
} base.height += numNew } func main() { - // get list of excel files - files := getExcelFiles() - // import them all into Datasets - datasets := ImportDatasets(files) + // get set up with the current directory + wd, err := os.Getwd() + if err != nil { + log.Println(err) + } + + // supported file extensions and their associated function for pulling the raw data + fileFormat := map[string]func(string) [][]string{ + ".xlsx": pullExcel, + ".csv": pullCSV, + ".ods": pullODS, + } + + fmt.Println("Attempting to merge spreadsheet files in ", wd) + + // map of basenames to the [][]string data + raws := make(map[string][][]string) + + err = filepath.Walk(wd, func(path string, info os.FileInfo, err error) error { + if info.IsDir() { + return nil + } + basename := strings.TrimSuffix(filepath.Base(path), filepath.Ext(path)) + ext := strings.ToLower(filepath.Ext(path)) + + if basename == EXPORT_BASE_FILENAME { // ignore our output file + return nil + } + if _, ok := fileFormat[ext]; ok { + raws[basename] = fileFormat[ext](path) + } + return nil + }) + if err != nil { + panic(err) + } + + datasets := make(map[string]*Dataset) + for name, data := range raws { + datasets[name] = ImportDataset(name, data) + } // merge them into a single Dataset var dataset Dataset