Commit 16e6e398 authored by Lysander Trischler's avatar Lysander Trischler

Add minimal classification support for unknown UAs

parent d0ed95be
......@@ -65,6 +65,23 @@ Example Usage
32900 @somebody → https://example.com/twtxt.txt
34 http://example.com/whoFollows?followers=8&token=gLvOWbFYT
Analyzing Unknown ``User-Agent``\ s
-----------------------------------
The ``-u`` or ``--show-unknown`` flag prints all the unknown ``User-Agent``\ s
with their occurrences to stdout, too. This way the program can be further
tuned and new twtxt formats built into it.
in raw form with their occurrences to stdout, too. This way the program can be
further tuned and new twtxt formats built into it.
The ``-c`` or ``--classify-unknown`` flag classifies and prints all the unknown
``User-Agent``\ s with their occurrences to stdout. It uses a very simple
builtin mapping table for Twtxt clients, bots, libraries and browsers. Please
note: this mapping is far from complete and even may not produce correct
classifications. This flag implies ``-u``/``--show-unknown``.
The ``-g`` or ``--group-unknown`` flag groups and prints all the classified
unknown ``User-Agent``\ s with their occurrences to stdout. This is similar to
``-c``/``--classify-unknown``, except that the output is even more condensed
and only the groups of the classified agents are shown. Groups are ``Twtxt
clients``, ``Bots``, ``Libraries``, ``Browsers``, ``-`` (if empty) and ``(???)``
(for unmapped agents). This flag implies both ``-u`` and ``-c``, i.e.
``--show-unknown --classify-unknown``.
......@@ -11,7 +11,15 @@ import (
func main() {
showUnknown := flag.BoolP("show-unknown", "u", false, "show all unkown User-Agents for further analysis")
classifyUnknown := flag.BoolP("classify-unknown", "c", false, "classify all unknown User-Agents for further analysis; implies -u")
groupUnknown := flag.BoolP("group-unknown", "g", false, "group all classified unknown User-Agents for further analysis; implies both -u and -c")
flag.Parse()
if *groupUnknown {
*classifyUnknown = true
}
if *classifyUnknown {
*showUnknown = true
}
twtxtUAs, nonTwtxtUAs := 0, 0
......@@ -75,8 +83,28 @@ func main() {
if *showUnknown && len(nonTwtxtUACounter) > 0 {
fmt.Println(strings.Repeat("=", 80))
for ua, cnt := range nonTwtxtUACounter {
fmt.Printf("%5d %s\n", cnt, ua)
if *classifyUnknown {
// map classified User-Agent to number of occurrences
classified := make(map[string]int)
for ua, cnt := range nonTwtxtUACounter {
agent, group := ClassifyNonTwtxtUserAgent(ua)
if *groupUnknown {
classified[group] += cnt
} else {
classified[agent] += cnt
}
}
printCounter(classified)
} else {
printCounter(nonTwtxtUACounter)
}
}
}
// printCounter writes one line per entry of counter to stdout: the count,
// right-aligned in a five character wide column, followed by the key.
//
// The lines appear in Go's map iteration order, which is unspecified and may
// differ between runs.
func printCounter(counter map[string]int) {
	const lineFormat = "%5d %s\n"
	for label := range counter {
		fmt.Printf(lineFormat, counter[label], label)
	}
}
......@@ -91,3 +91,39 @@ func setNicksAndURLs(ua *UserAgent, followerNicks, referenceURL string) {
}
}
}
// classifier ties a group label to the substrings that identify the agents
// belonging to that group.
type classifier struct {
	// group is the label reported for every agent listed in this entry.
	group string
	// agents are the substrings looked for in a User-Agent. The lookup is
	// case-sensitive, so each entry must be spelled exactly the way the
	// agent spells it in its own User-Agent header.
	agents []string
}

// classifiers is the builtin mapping table for unknown User-Agents. Entries
// and their agents are tried in the order given here and the first substring
// hit wins, so the order of both the groups and the agents is significant.
//
// The table is neither complete nor guaranteed to classify correctly.
var classifiers = []classifier{
	{"Twtxt clients", []string{"twtxt", "txtnish"}},
	// Apple's crawler identifies itself as "Applebot" (lowercase b).
	{"Bots", []string{"AhrefsBot", "Applebot", "Barkrowler", "CCBot", "DotBot", "Googlebot", "ltx71"}},
	{"Libraries", []string{"curl", "Go-http-client", "ruby", "Wget"}},
	{"Browsers", []string{"Safari", "Firefox", "Mozilla"}},
}
// twtdRegex matches a complete User-Agent of the form
// "twtxt/<version> (Pod: <pod> Support: <support>)" and captures the value
// that follows "Support: ".
// NOTE(review): uri is defined outside this view; callers check for exactly
// two submatches, which assumes uri adds no capture groups of its own —
// confirm.
var twtdRegex = regexp.MustCompile(`^twtxt/[\w.@_/-]+ \(Pod: ` + uri + ` Support: (` + uri + `)\)$`)
// ClassifyNonTwtxtUserAgent maps a raw User-Agent onto a short agent name and
// the group that agent belongs to, returned in that order.
//
// The rules are applied in this order:
//   - An empty User-Agent or "-" yields "-" for both values.
//   - A User-Agent matching twtdRegex yields "twtd " followed by the captured
//     Support value, in the group "Twtxt clients".
//   - Otherwise the classifiers table is scanned in order and the first agent
//     that occurs as a case-sensitive substring is returned with its group.
//   - If nothing matches, the agent is "(???) " followed by the unmodified
//     User-Agent and the group is "(???)".
func ClassifyNonTwtxtUserAgent(userAgent string) (string, string) {
	switch userAgent {
	case "", "-":
		return "-", "-"
	}

	// twtxt/0.1.0@0f0603f (Pod: nfld Support: http://twt.nfld.uk/support)
	if m := twtdRegex.FindStringSubmatch(userAgent); len(m) == 2 {
		return "twtd " + m[1], "Twtxt clients"
	}

	for i := range classifiers {
		entry := &classifiers[i]
		for _, needle := range entry.agents {
			if strings.Contains(userAgent, needle) {
				return needle, entry.group
			}
		}
	}

	return "(???) " + userAgent, "(???)"
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment