Commit c1b6d041 authored by Lysander Trischler's avatar Lysander Trischler

Implement User-Agent analyzer

parent e306d598
......@@ -3,4 +3,4 @@
*.pyc
*.pyo
__pycache__/
useragent/useragent
all: test useragent
test:
go test -v -cover -race ./...
useragent:
go build
module useragent
go 1.15
require github.com/stretchr/testify v1.6.1
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
package main
import (
"bufio"
"fmt"
"net/url"
"os"
)
// main reads Nginx access log lines from standard input, parses the
// User-Agent of every line and prints a small report: the number of twtxt and
// non-twtxt User-Agents, followed by one line per single user feed and one
// line per Who Follows Resource host, each prefixed with its request count.
func main() {
	twtxtUAs, nonTwtxtUAs := 0, 0

	// map URL to nick
	singleUsers := make(map[string]string)

	// map URL of single user or hostname of Who Follows Resource to number of occurrences
	urlCounter := make(map[string]int)

	// map Who Follows Resource hostname to last full URL
	whoFollowsResources := make(map[string]string)

	scanner := bufio.NewScanner(os.Stdin)
	for scanner.Scan() {
		ua := ParseUserAgent(ExtractUserAgentFromNginxAccessLog(scanner.Text()))
		if !ua.IsTwtxt() {
			nonTwtxtUAs++
			continue
		}
		twtxtUAs++

		switch {
		case ua.TwtxtURL != "":
			singleUsers[ua.TwtxtURL] = ua.TwtxtNick
			urlCounter[ua.TwtxtURL]++
		case ua.WhoFollowsURL != "":
			// Who Follows URLs carry per-request query parameters, so they
			// are grouped by hostname rather than by full URL.
			u, err := url.Parse(ua.WhoFollowsURL)
			if err != nil {
				fmt.Printf("ERROR: Invalid Who Follows Resource: %v\n", err)
				urlCounter["ERROR"]++
				continue
			}
			hostname := u.Hostname()
			whoFollowsResources[hostname] = ua.WhoFollowsURL
			urlCounter[hostname]++
		}
	}
	if err := scanner.Err(); err != nil {
		fmt.Println(err)
	}

	fmt.Printf("Twtxt UAs: %d Non-Twtxt UAs: %d\n", twtxtUAs, nonTwtxtUAs)
	// The loop variables deliberately avoid the name "url" so they do not
	// shadow the net/url package used above.
	for feedURL, nick := range singleUsers {
		fmt.Printf("%5d @%v → %s\n", urlCounter[feedURL], nick, feedURL)
	}
	for hostname, resourceURL := range whoFollowsResources {
		fmt.Printf("%5d %s\n", urlCounter[hostname], resourceURL)
	}
}
package main
import (
"net/url"
"regexp"
"strings"
)
// UserAgent holds the pieces of a User-Agent header value that
// ParseUserAgent extracts.
type UserAgent struct {
	// ClientName and ClientVersion come from the first "name/version" token.
	ClientName, ClientVersion string
	// TwtxtURL and TwtxtNick are set for the single user form
	// "(+<url>; @<nick>)".
	TwtxtURL, TwtxtNick string
	// WhoFollowsURL is set for multi user clients that advertise a Who
	// Follows Resource.
	WhoFollowsURL string
}

// IsTwtxt reports whether at least one of the twtxt specific fields
// (TwtxtURL, TwtxtNick or WhoFollowsURL) is non-empty.
func (ua UserAgent) IsTwtxt() bool {
	nothingSet := ua.TwtxtURL == "" && ua.TwtxtNick == "" && ua.WhoFollowsURL == ""
	return !nothingSet
}
// ExtractUserAgentFromNginxAccessLog returns the User-Agent string from a
// single Nginx access log line, without its surrounding quotes.
//
// The User-Agent is expected to be the last double-quoted field of the line
// and the line is expected to end with its closing quote. Nginx hex-encodes
// literal quotes in both the URL and the User-Agent header, so every literal
// quote in the line is a field delimiter. Sample lines:
//
//	2021-02-06T08:59:30+01:00 "GET /twtxt.txt HTTP/1.1" 304 "twtxt/1.2.3"
//	2021-02-06T10:52:50+01:00 "GET /twtxt.txt?lol=\x22quotes\x22 HTTP/1.1" 200 "\x22quotes\x22"
//
// An empty string is returned for lines that do not have this shape: empty
// lines, lines that do not end with a quote, and lines without an opening
// quote.
func ExtractUserAgentFromNginxAccessLog(line string) string {
	// Without a closing quote there is no quoted User-Agent field at the end.
	// This also covers the empty line, which would otherwise make the slice
	// expression below panic.
	if !strings.HasSuffix(line, `"`) {
		return ""
	}
	withoutClosingQuote := line[:len(line)-1]

	// find the start of the user agent
	start := strings.LastIndex(withoutClosingQuote, `"`)
	if start < 0 {
		return ""
	}
	return withoutClosingQuote[start+1:]
}
// uri is the character class used to match URLs (and other URL-like tokens
// such as pod names) inside User-Agent strings.
const uri = `[\w:/._?&=@-]+`

var (
	// clientRegex matches a "name/version" product token.
	clientRegex = regexp.MustCompile(`(\w+)/([\w.@_/-]+)`)

	// singleUserRegex matches "(+<url>; @<nick>)" and captures URL and nick.
	singleUserRegex = regexp.MustCompile(`\(\+(` + uri + `);\s*@(\w+)\)`)

	// multiUserRegex matches "(~<url>)" with an optional "; contact=<url>"
	// part and captures the Who Follows URL in its first group.
	multiUserRegex = regexp.MustCompile(`\(~(` + uri + `)(;\s*contact=` + uri + `)?\)`)

	// oldMultiUser25Regex matches the old multi user format that lists all
	// followers: "(Pod: <pod> Followers: <nicks> Support: <url>)".
	oldMultiUser25Regex = regexp.MustCompile(`\(Pod: ` + uri + ` Followers: ([\w\s]+) Support: (` + uri + `)\)`)

	// oldMultiUser6Regex matches the old multi user format that truncates the
	// follower list: "(Pod: <pod> Followers: <nicks> and <n> more... <url>
	// Support: <url>)". The dots of the literal ellipsis are escaped so they
	// do not match arbitrary characters.
	oldMultiUser6Regex = regexp.MustCompile(`\(Pod: ` + uri + ` Followers: ([\w\s]+) and \d+ more\.\.\. (` + uri + `) Support: ` + uri + `\)`)
)
// ParseUserAgent parses a given User-Agent string into its relevant pieces.
//
// ClientName and ClientVersion are taken from the first "name/version" token,
// if there is one. The twtxt specific formats are then tried in this order
// and the first match wins:
//
//  1. single user "(+<url>; @<nick>)" sets TwtxtURL and TwtxtNick,
//  2. multi user "(~<url>[; contact=<url>])" sets WhoFollowsURL,
//  3. old multi user format with a truncated follower list sets WhoFollowsURL,
//  4. old multi user format with a full follower list sets no twtxt field yet.
//
// If nothing matches, including for the empty string, the twtxt fields stay
// empty.
func ParseUserAgent(userAgent string) UserAgent {
	ua := UserAgent{}

	if client := clientRegex.FindStringSubmatch(userAgent); len(client) == 3 {
		ua.ClientName = client[1]
		ua.ClientVersion = client[2]
	}

	if singleUser := singleUserRegex.FindStringSubmatch(userAgent); len(singleUser) == 3 {
		ua.TwtxtURL = singleUser[1]
		ua.TwtxtNick = singleUser[2]
		return ua
	}

	// The contact group is optional, so only the first capture is required.
	if multiUser := multiUserRegex.FindStringSubmatch(userAgent); len(multiUser) >= 2 {
		ua.WhoFollowsURL = multiUser[1]
		return ua
	}

	if oldMultiUser6 := oldMultiUser6Regex.FindStringSubmatch(userAgent); len(oldMultiUser6) == 3 {
		// TODO generate known nicks and URLs
		ua.WhoFollowsURL = oldMultiUser6[2]
		return ua
	}

	if oldMultiUser25 := oldMultiUser25Regex.FindStringSubmatch(userAgent); len(oldMultiUser25) == 3 {
		// The support URL is parsed but its result is not used yet; it is the
		// starting point for the TODO below.
		if _, err := url.Parse(oldMultiUser25[2]); err != nil {
			return ua
		}
		// TODO generate known nicks and URLs
		return ua
	}

	return ua
}
package main
import (
"github.com/stretchr/testify/assert"
"testing"
)
// TestExtractUserAgentFromNginxAccessLog checks that the User-Agent field is
// cut out of representative Nginx access log lines.
func TestExtractUserAgentFromNginxAccessLog(t *testing.T) {
	testCases := []struct {
		name     string
		line     string
		expected string
	}{
		{
			name:     "Client and version only",
			line:     `2021-02-06T08:59:30+01:00 "GET /twtxt.txt HTTP/1.1" 304 "twtxt/1.2.3"`,
			expected: `twtxt/1.2.3`,
		},
		{
			name:     "Single user client User-Agent",
			line:     `2021-02-06T07:34:54+01:00 "GET /twtxt.txt HTTP/1.1" 304 "twtxt/0.1.0@ff7e288 (+https://txt.sour.is/user/xuu/twtxt.txt; @xuu)"`,
			expected: `twtxt/0.1.0@ff7e288 (+https://txt.sour.is/user/xuu/twtxt.txt; @xuu)`,
		},
		{
			name:     "Multi user client User-Agent",
			line:     `2021-02-06T09:26:47+01:00 "GET /twtxt.txt HTTP/1.1" 304 "twtxt/0.1.0@35cbec4 (~https://twtxt.net/whoFollows?followers=9&token=3VFp9uq2B; contact=https://twtxt.net/support)"`,
			expected: `twtxt/0.1.0@35cbec4 (~https://twtxt.net/whoFollows?followers=9&token=3VFp9uq2B; contact=https://twtxt.net/support)`,
		},
		{
			name:     "Ruby User-Agent",
			line:     `2021-02-06T03:02:58+01:00 "HEAD /twtxt.txt HTTP/1.1" 200 "rest-client/2.0.0 (linux-gnu x86_64) ruby/2.2.6p396"`,
			expected: `rest-client/2.0.0 (linux-gnu x86_64) ruby/2.2.6p396`,
		},
		{
			name:     "Escaped quotes in User-Agent",
			line:     `2021-02-06T10:52:50+01:00 "GET /twtxt.txt?lol=\x22quotes\x22 HTTP/1.1" 200 "\x22quotes\x22"`,
			expected: `\x22quotes\x22`,
		},
	}
	for _, tc := range testCases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			// Assert against the subtest's own t so that a failure is
			// attributed to the named subtest, not to the parent test.
			assert.Equal(t, tc.expected, ExtractUserAgentFromNginxAccessLog(tc.line))
		})
	}
}
func TestParseUserAgent(t *testing.T) {
assert := assert.New(t)
testCases := []struct {
name string
userAgent string
expected UserAgent
}{
{
name: "Random bot User-Agent",
userAgent: "Mozilla/5.0 (compatible; Barkrowler/0.9; +https://babbar.tech/crawler)",
expected: UserAgent{
ClientName: "Mozilla",
ClientVersion: "5.0",
TwtxtURL: "",
TwtxtNick: "",
WhoFollowsURL: "",
},
},
{
name: "Single user client User-Agent",
userAgent: "twtxt/0.1.0@ec2f019 (+https://txt.sour.is/user/xuu/twtxt.txt; @xuu)",
expected: UserAgent{
ClientName: "twtxt",
ClientVersion: "0.1.0@ec2f019",
TwtxtURL: "https://txt.sour.is/user/xuu/twtxt.txt",
TwtxtNick: "xuu",
WhoFollowsURL: "",
},
},
{
name: "Multi user client User-Agent with Who Follows Resource and contact information",
userAgent: "twtxt/0.1.0@35cbec4 (~https://twtxt.net/whoFollows?followers=9&token=3VFp9uq2B; contact=https://twtxt.net/support)",
expected: UserAgent{
ClientName: "twtxt",
ClientVersion: "0.1.0@35cbec4",
TwtxtURL: "",
TwtxtNick: "",
WhoFollowsURL: "https://twtxt.net/whoFollows?followers=9&token=3VFp9uq2B",
},
},
{
name: "Multi user client User-Agent with Who Follows Resource but without contact information",
userAgent: "twtxt/0.1.0@35cbec4 (~https://twtxt.net/whoFollows?followers=9&token=3VFp9uq2B)",
expected: UserAgent{
ClientName: "twtxt",
ClientVersion: "0.1.0@35cbec4",
TwtxtURL: "",
TwtxtNick: "",
WhoFollowsURL: "https://twtxt.net/whoFollows?followers=9&token=3VFp9uq2B",
},
},
{
name: "Old multi user client User-Agent with two to five followers",
userAgent: "twtxt/0.1.0@69ac73b (Pod: nfld Followers: alice jlj Support: https://twt.nfld.uk/support)",
expected: UserAgent{
ClientName: "twtxt",
ClientVersion: "0.1.0@69ac73b",
TwtxtURL: "",
TwtxtNick: "",
WhoFollowsURL: "",
},
},
{
name: "Old multi user client User-Agent with six or more followers",
userAgent: "twtxt/0.1.0@37fd365 (Pod: twtxt.net Followers: adi antonio darch gareppa ionores and 3 more... https://twtxt.net/whoFollows?uri=https://lyse.isobeef.org/twtxt.txt&nick=lyse&token=OzcdPbe6Z Support: https://twtxt.net/support)",
expected: UserAgent{
ClientName: "twtxt",
ClientVersion: "0.1.0@37fd365",
TwtxtURL: "",
TwtxtNick: "",
WhoFollowsURL: "https://twtxt.net/whoFollows?uri=https://lyse.isobeef.org/twtxt.txt&nick=lyse&token=OzcdPbe6Z",
},
},
{
name: "No User-Agent",
userAgent: "",
expected: UserAgent{
ClientName: "",
ClientVersion: "",
TwtxtURL: "",
TwtxtNick: "",
WhoFollowsURL: "",
// TODO
},
},
}
for _, testCase := range testCases {
t.Run(testCase.name, func(t *testing.T) {
assert.Equal(testCase.expected, ParseUserAgent(testCase.userAgent))
})
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment