Skip to content
Snippets Groups Projects
scraper.go 4.71 KiB
Newer Older
  • Learn to ignore specific revisions
    /*
     * Copyright 2017-2018 Dgraph Labs, Inc.
     *
     * This file is available under the Apache License, Version 2.0,
     * with the Commons Clause restriction.
     */
    
    package main
    
    import (
    	"bufio"
    	"fmt"
    	"net/http"
    	"os"
    	"os/exec"
    	"strings"
    
    	"github.com/MakeNowJust/heredoc"
    	"github.com/tebeka/snowball"
    )
    
    
    // Simple tool to get the lists of stopwords.
    // Source of stopwords: https://github.com/6/stopwords-json (license: Apache 2.0)
    
    	fn := "/tmp/stopwords.go.generated"
    	f, _ := os.Create(fn)
    
    	w := bufio.NewWriter(f)
    	w.WriteString(heredoc.Doc(`
    		/*
    
    		 * Copyright 2017-2018 Dgraph Labs, Inc.
    
    		 * This file is available under the Apache License, Version 2.0,
    		 * with the Commons Clause restriction.
    
    		 */
    
    		package tok
    
    		// CODE GENERATED BY contrib/stopwords
    		// DO NOT EDIT!
    
    
    		// Source of stopwords: https://github.com/6/stopwords-json (license: Apache 2.0)
    
    		// Awailable languages:
    		//   danish, dutch, english, finnish, french, german, hungarian, italian, norwegian, portuguese,
    		//   romanian, russian, spanish, swedish, turkish
    		var stopwords = map[string][]interface{}{
    	`))
    	for _, lang := range snowball.LangList() {
    		if lang == "porter" {
    			continue
    		}
    		fmt.Println(lang)
    
    		ln := getLangCode(lang)
    
    		url := "https://raw.githubusercontent.com/6/stopwords-json/master/dist/" + ln + ".json"
    
    		resp, err := http.Get(url)
    		if err != nil {
    			return
    		}
    		defer resp.Body.Close()
    
    		status := resp.StatusCode
    		if 200 <= status && status < 300 {
    			// conditional allow
    			scanner := bufio.NewScanner(resp.Body)
    			w.WriteRune('"')
    			w.WriteString(lang)
    			w.WriteString("\": {")
    			for scanner.Scan() {
    				w.WriteString(strings.Trim(scanner.Text(), "[]"))
    			}
    			w.WriteString("},\n")
    			if err := scanner.Err(); err != nil {
    			}
    		}
    	}
    	w.WriteString("}\n")
    	w.Flush()
    
    
    	fmt := exec.Command("gofmt", "-w", fn)
    	err := fmt.Start()
    	if err == nil {
    		fmt.Wait()
    	}
    
    }
    
    func getLangCode(lang string) string {
    	// List based on https://godoc.org/golang.org/x/text/language#Tag
    	// It contains more languages than supported by Bleve, to enable seamless addition of new langs.
    	mapping := map[string]string{
    		"afrikaans":            "af",
    		"amharic":              "am",
    		"arabic":               "ar",
    		"modernstandardarabic": "ar-001",
    		"azerbaijani":          "az",
    		"bulgarian":            "bg",
    		"bengali":              "bn",
    		"catalan":              "ca",
    		"czech":                "cs",
    		"danish":               "da",
    		"german":               "de",
    		"greek":                "el",
    		"english":              "en",
    		"americanenglish":      "en-us",
    		"britishenglish":       "en-gb",
    		"spanish":              "es",
    		"europeanspanish":      "es-es",
    		"latinamericanspanish": "es-419",
    		"estonian":             "et",
    		"persian":              "fa",
    		"finnish":              "fi",
    		"filipino":             "fil",
    		"french":               "fr",
    		"canadianfrench":       "fr-ca",
    		"gujarati":             "gu",
    		"hebrew":               "he",
    		"hindi":                "hi",
    		"croatian":             "hr",
    		"hungarian":            "hu",
    		"armenian":             "hy",
    		"indonesian":           "id",
    		"icelandic":            "is",
    		"italian":              "it",
    		"japanese":             "ja",
    		"georgian":             "ka",
    		"kazakh":               "kk",
    		"khmer":                "km",
    		"kannada":              "kn",
    		"korean":               "ko",
    		"kirghiz":              "ky",
    		"lao":                  "lo",
    		"lithuanian":           "lt",
    		"latvian":              "lv",
    		"macedonian":           "mk",
    		"malayalam":            "ml",
    		"mongolian":            "mn",
    		"marathi":              "mr",
    		"malay":                "ms",
    		"burmese":              "my",
    		"nepali":               "ne",
    		"dutch":                "nl",
    		"norwegian":            "no",
    		"punjabi":              "pa",
    		"polish":               "pl",
    		"portuguese":           "pt",
    		"brazilianportuguese":  "pt-br",
    		"europeanportuguese":   "pt-pt",
    		"romanian":             "ro",
    		"russian":              "ru",
    		"sinhala":              "si",
    		"slovak":               "sk",
    		"slovenian":            "sl",
    		"albanian":             "sq",
    		"serbian":              "sr",
    		"serbianlatin":         "sr-latn",
    		"swedish":              "sv",
    		"swahili":              "sw",
    		"tamil":                "ta",
    		"telugu":               "te",
    		"thai":                 "th",
    		"turkish":              "tr",
    		"ukrainian":            "uk",
    		"urdu":                 "ur",
    		"uzbek":                "uz",
    		"vietnamese":           "vi",
    		"chinese":              "zh",
    		"simplifiedchinese":    "zh-hans",
    		"traditionalchinese":   "zh-hant",
    		"zulu":                 "zu",
    	}
    
    	code, ok := mapping[lang]
    	if ok {
    		return code
    	}
    
    	panic("Unsupported language: " + lang)