merge

Simple tool to quickly merge datasets for statistical analysis
git clone git://git.wrycode.com/wrycode/archive/merge.git
Log | Files | Refs | README | LICENSE

commit 151b56db47f3dfaf48024eb60c93e61c55d5fcf8
Author: Nick Econopouly <wry@mm.st>
Date:   Tue, 21 Jan 2020 16:37:46 -0500

Initial commit.

Diffstat:
Aarrayutils.go | 42++++++++++++++++++++++++++++++++++++++++++
Aimport.go | 118+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Amain.go | 74++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Awinbuild.sh | 2++
4 files changed, 236 insertions(+), 0 deletions(-)

diff --git a/arrayutils.go b/arrayutils.go @@ -0,0 +1,42 @@ +package main + +// Index returns the first index of the target string t, or -1 if no +// match is found +func index(terms []string, s string) int { + for i, v := range terms { + if v == s { + return i + } + } + return -1 +} + +// Include returns true if the target string t is in the slice +func Include(terms []string, term string) bool { + return index(terms, term) >= 0 +} + +// Remove returns a []string with all instances of term removed, or +// unchanged if the term isn't in the slice +func Remove(terms []string, term string) []string { + var result []string + for _, t := range terms { + if t != term { + result = append(result, t) + } + } + if result != nil { + return result + } + return []string{} +} + +// notAllSame returns true if not every element of a string slice is the same +func notAllSame(s []string) bool { + for i := 0; i < len(s); i++ { + if s[i] != s[0] { + return true + } + } + return false +} diff --git a/import.go b/import.go @@ -0,0 +1,118 @@ +package main + +import ( + "fmt" + "github.com/360EntSecGroup-Skylar/excelize" + "log" +) + +// Dataset holds all of the variables and their data +type Dataset struct { + data map[string][]string // maps terms to ordered data (columns) + terms []string // ordered list of terms + name string // name of the dataset + height int // height of the XLSX file (including the first row!) +} + +// ImportXLSX imports an XLSX file. It takes filename, sheet name, +// name for the Dataset internally, and height in rows as arguments +// and returns a *Dataset. +func ImportXLSX(filename string) *Dataset { + // Initialize Dataset + var d Dataset + d.data = make(map[string][]string) + d.name = filename + // d.height = height + + // pull data from file + f, err := excelize.OpenFile(filename) + if err != nil { + fmt.Println("Problem importing ", filename, err) + return &d + } + + sheetName := f.GetSheetMap()[1] + fmt.Println(sheetName) + + rows, err := f.GetRows(sheetName) + + if err != nil { + fmt.Println("Cannot read XLSX data in", filename, err.Error()) + return &d + } + d.height = len(rows) + + d.terms = rows[0] + + // Create blank column for each term + for _, term := range d.terms { + d.data[term] = make([]string, d.height-1) // specimens start on the second row + } + + // Fill in columns + for j, row := range rows[1:d.height] { + for i, value := range row { + d.data[d.terms[i]][j] = value + } + } + // d.removeUnusedTerms() + return &d +} + +// removeUnusedTerms clears unused terms from the term list +func (d *Dataset) removeUnusedTerms() { + // find unused terms + var termsToRemove []string + for _, term := range d.terms { + if !notAllSame(d.data[term]) { + termsToRemove = append(termsToRemove, term) + } + } + + // clear them + for _, term := range termsToRemove { + if Include(d.terms, term) { + d.terms = Remove(d.terms, term) + } + } +} + +func ImportDatasets(paths []string) map[string]*Dataset { + datasets := make(map[string]*Dataset) + + for _, file := range paths { + // fmt.Println(file) + datasets[file] = ImportXLSX(file) + } + return datasets +} + +func exportDataset(d *Dataset, name string) { + f := excelize.NewFile() + sheetName := f.GetSheetMap()[1] + + for i, name := range d.terms { + // write the term + coord, err := excelize.CoordinatesToCellName(i+1,1 ) + if err != nil { + log.Fatal("Trouble getting coaordinates:", err) + } + f.SetCellStr(sheetName,coord, name) + + // write the rest of the column + for j, value := range d.data[name] { + coord, err := excelize.CoordinatesToCellName(i+1, j+2) + if err != nil { + log.Fatal("Trouble getting coaordinates:", err) + } + f.SetCellStr(sheetName, coord, value) + } + } + + err := f.SaveAs("merged.xlsx") + if err != nil { + log.Fatal("Unable to save file", err) + } + +} + diff --git a/main.go b/main.go @@ -0,0 +1,74 @@ +package main + +import ( + "fmt" + "os" + "log" + "path/filepath" +) + +func getExcelFiles() []string { + var files []string + + path, err := os.Getwd() + if err != nil { + log.Println(err) + } + + fmt.Println("Attempting to merge Microsoft Excel files in ", path) + + err = filepath.Walk(path, func(path string, info os.FileInfo, err error) error { + if info.IsDir() { + return nil + } + if filepath.Base(path) == "merged.xlsx" { // ignore our output file + return nil + } + if filepath.Ext(path) == ".xlsx" || filepath.Ext(path) == ".XLSX" { + files = append(files, path) + } + return nil + }) + if err != nil { + panic(err) + } + return files +} + +func mergeDatasets(base *Dataset, new *Dataset) { + numOld := base.height - 1 + numNew := new.height - 1 // number of specimens being added + + for _, term := range new.terms { + if !Include(base.terms, term) { + base.terms = append(base.terms, term) + base.data[term] = make([]string, numOld) + } + + } + + for _, term := range base.terms { + base.data[term] = append(base.data[term], new.data[term]...) + } + + base.height += numNew +} + +func main() { + // get list of excel files + files := getExcelFiles() + + // import them all into Datasets + datasets := ImportDatasets(files) + + // merge them into a single Dataset + var dataset Dataset + dataset.height = 1 // row of terms, even though it's empty + dataset.data = make(map[string][]string) + for _, d := range datasets { + mergeDatasets(&dataset,d) + } + + // export dataset + exportDataset(&dataset, "merged.xlsx") +} diff --git a/winbuild.sh b/winbuild.sh @@ -0,0 +1,2 @@ +#!/bin/bash +GOOS=windows GOARCH=amd64 go build