commit 58356b34ac917bd7084566c48ea681d25f28e9ca
parent 151b56db47f3dfaf48024eb60c93e61c55d5fcf8
Author: Nick Econopouly <wry@mm.st>
Date: Thu, 5 Mar 2020 18:30:15 -0500
Fix silly bug in mergeDatasets
Diffstat:
M | import.go | | | 43 | ++++++++++++------------------------------- |
M | main.go | | | 155 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------- |
2 files changed, 141 insertions(+), 57 deletions(-)
diff --git a/import.go b/import.go
@@ -11,38 +11,19 @@ type Dataset struct {
data map[string][]string // maps terms to ordered data (columns)
terms []string // ordered list of terms
name string // name of the dataset
- height int // height of the XLSX file (including the first row!)
+ height int // height of the XLSX file (including the first row of terms)
}
-// ImportXLSX imports an XLSX file. It takes filename, sheet name,
-// name for the Dataset internally, and height in rows as arguments
-// and returns a *Dataset.
-func ImportXLSX(filename string) *Dataset {
+func ImportDataset(name string, rows [][]string) *Dataset {
+ fmt.Println("name is ", name)
// Initialize Dataset
var d Dataset
d.data = make(map[string][]string)
- d.name = filename
- // d.height = height
- // pull data from file
- f, err := excelize.OpenFile(filename)
- if err != nil {
- fmt.Println("Problem importing ", filename, err)
- return &d
- }
-
- sheetName := f.GetSheetMap()[1]
- fmt.Println(sheetName)
-
- rows, err := f.GetRows(sheetName)
-
- if err != nil {
- fmt.Println("Cannot read XLSX data in", filename, err.Error())
- return &d
- }
d.height = len(rows)
d.terms = rows[0]
+ d.name = name
// Create blank column for each term
for _, term := range d.terms {
@@ -77,15 +58,15 @@ func (d *Dataset) removeUnusedTerms() {
}
}
-func ImportDatasets(paths []string) map[string]*Dataset {
- datasets := make(map[string]*Dataset)
+// func ImportDatasets(paths []string) map[string]*Dataset {
+// datasets := make(map[string]*Dataset)
- for _, file := range paths {
- // fmt.Println(file)
- datasets[file] = ImportXLSX(file)
- }
- return datasets
-}
+// for _, file := range paths {
+// // fmt.Println(file)
+// datasets[file] = ImportXLSX(file)
+// }
+// return datasets
+// }
func exportDataset(d *Dataset, name string) {
f := excelize.NewFile()
diff --git a/main.go b/main.go
@@ -4,62 +4,165 @@ import (
"fmt"
"os"
"log"
- "path/filepath"
+ "path/filepath"
+ "strings"
+ "github.com/360EntSecGroup-Skylar/excelize"
+ "github.com/knieriem/odf/ods"
+ "encoding/csv"
)
-func getExcelFiles() []string {
- var files []string
+// EXPORT_BASE_FILENAME is the extension-less base name of the merged output
+// file. main compares each walked file's base name against it so that the
+// output file is not picked up again as an input.
+const EXPORT_BASE_FILENAME = "merged"
+
+// pullExcel opens the workbook at path and returns the rows of the sheet
+// stored under index 1 of its sheet map. Failures are reported on stdout
+// instead of being returned: a workbook that cannot be opened yields nil,
+// and a GetRows error yields whatever rows GetRows handed back.
+func pullExcel(path string) [][]string {
+	workbook, openErr := excelize.OpenFile(path)
+	if openErr != nil {
+		fmt.Println("Problem importing ", path, openErr)
+		return nil
+	}
+
+	// Only the sheet registered under index 1 is read.
+	firstSheet := workbook.GetSheetMap()[1]
+
+	rows, readErr := workbook.GetRows(firstSheet)
+	if readErr != nil {
+		fmt.Println("Problem getting rows from ", path, readErr)
+	}
+	return rows
+}
+
+// pullCSV reads every record of the CSV file at path and returns them as
+// rows of fields. Problems are reported on stdout instead of being returned:
+// a file that cannot be opened yields nil, and a read failure yields
+// whatever ReadAll handed back alongside its error.
+func pullCSV(path string) [][]string {
+ var rows [][]string
- path, err := os.Getwd()
+ f, err := os.Open(path)
if err != nil {
- log.Println(err)
+ fmt.Printf("Cannot open '%s': %s\n", path, err.Error())
+ return rows
}
+ defer f.Close()
- fmt.Println("Attempting to merge Microsoft Excel files in ", path)
+ // csv.Reader only wraps f and has no Close method of its own, so the
+ // deferred f.Close() above is the only cleanup needed.
+ r := csv.NewReader(f)
+ r.LazyQuotes = true // accept stray quote characters inside fields
+ rows, err = r.ReadAll()
- err = filepath.Walk(path, func(path string, info os.FileInfo, err error) error {
- if info.IsDir() {
- return nil
- }
- if filepath.Base(path) == "merged.xlsx" { // ignore our output file
- return nil
- }
- if filepath.Ext(path) == ".xlsx" || filepath.Ext(path) == ".XLSX" {
- files = append(files, path)
- }
- return nil
- })
if err != nil {
- panic(err)
+ // report the underlying error too, as the other pull helpers do
+ fmt.Println("trouble reading rows in ", path, ":", err)
+ return rows
}
- return files
+ return rows
+}
+
+// pullODS returns the cell strings of the first table in the ODS document
+// at path. Failures are reported on stdout instead of being returned; in
+// every failure case (unreadable file, unparsable content, or a document
+// without tables) the result is nil.
+func pullODS(path string) [][]string {
+	var rows [][]string
+
+	f, err := ods.Open(path)
+	if err != nil {
+		fmt.Println("trouble reading ods file", path, ":", err)
+		return rows
+	}
+	// release the underlying file once the content has been parsed
+	defer f.Close()
+
+	var d ods.Doc
+	if err = f.ParseContent(&d); err != nil {
+		fmt.Println("trouble parsing ods file", path, ":", err)
+		return rows
+	}
+
+	// indexing d.Table[0] on a document without tables would panic
+	if len(d.Table) == 0 {
+		fmt.Println("no tables found in ods file", path)
+		return rows
+	}
+
+	return d.Table[0].Strings()
}
+// mergeDatasets appends the data rows of new onto base, modifying base in
+// place. height counts the header row of terms, so each dataset holds
+// height-1 data rows. Every row taken from new is tagged with new.name in
+// a "DATASET" column. Terms that exist in only one of the two datasets are
+// added to the other with blank values, so all columns stay the same length.
+// Note that new is modified as well: it gains the DATASET term and blank
+// columns for any base-only terms.
func mergeDatasets(base *Dataset, new *Dataset) {
numOld := base.height - 1
- numNew := new.height - 1 // number of specimens being added
+ numNew := new.height - 1 // number of specimens being added
+
+ // on the first merge the base dataset is empty (no terms or data)
+ if !Include(base.terms, "DATASET") {
+ base.terms = append(base.terms, "DATASET")
+ // if base already holds rows, pad its DATASET column so it lines up
+ // with the other columns once the new rows are appended
+ if numOld > 0 {
+ base.data["DATASET"] = make([]string, numOld)
+ }
+ }
+
+ // add DATASET term, recording which dataset each new row came from
+ new.terms = append(new.terms, "DATASET")
+ new.data["DATASET"] = make([]string, numNew)
+ for i := range new.data["DATASET"] {
+ new.data["DATASET"][i] = new.name
+ }
+ // add blank data for new terms not present in the old dataset
for _, term := range new.terms {
if !Include(base.terms, term) {
base.terms = append(base.terms, term)
base.data[term] = make([]string, numOld)
}
+ }
+ // add blank data for old terms not present in the new dataset
+ for _, term := range base.terms {
+ if !Include(new.terms, term) {
+ new.terms = append(new.terms, term)
+ new.data[term] = make([]string, numNew)
+ }
}
+ // datasets are equalized (no new data, just blank terms where
+ // appropriate) so merging them is a simple append
for _, term := range base.terms {
- base.data[term] = append(base.data[term], new.data[term]...)
+ base.data[term] = append(base.data[term], new.data[term]...)
}
base.height += numNew
}
func main() {
- // get list of excel files
- files := getExcelFiles()
- // import them all into Datasets
- datasets := ImportDatasets(files)
+ // get set up with the current directory
+ wd, err := os.Getwd()
+ if err != nil {
+ log.Println(err)
+ }
+
+ // supported file extensions and their associated function for pulling the raw data
+ fileFormat := map[string]func(string) [][]string{
+ ".xlsx": pullExcel,
+ ".csv": pullCSV,
+ ".ods": pullODS,
+ }
+
+ fmt.Println("Attempting to merge spreadsheet files in ", wd)
+
+ // map of basenames to the [][]string data
+ raws := make(map[string][][]string)
+
+ err = filepath.Walk(wd, func(path string, info os.FileInfo, err error) error {
+ if info.IsDir() {
+ return nil
+ }
+ basename := strings.TrimSuffix(filepath.Base(path), filepath.Ext(path))
+ ext := strings.ToLower(filepath.Ext(path))
+
+ if basename == EXPORT_BASE_FILENAME { // ignore our output file
+ return nil
+ }
+ if _, ok := fileFormat[ext]; ok {
+ raws[basename] = fileFormat[ext](path)
+ }
+ return nil
+ })
+ if err != nil {
+ panic(err)
+ }
+
+ datasets := make(map[string]*Dataset)
+ for name, data := range raws {
+ datasets[name] = ImportDataset(name, data)
+ }
// merge them into a single Dataset
var dataset Dataset