Commit 4eae1732 authored by Lysander Trischler

Refactor hash interface for future reuse

This refactoring also uncovered a bug that had been masked by the
`twtxt.models.Tweet` constructor, which did the actual conversion to
seconds precision. Without the Tweet container in between, two
precision-related tests failed. So this precision handling needed to be
replicated in the hash creation – where it actually belongs. One day,
we will finally get rid of `twtxt.models.Tweet` entirely, so it will
not really be duplicated anymore.

This is part of surgically removing the twtxt reference implementation
from tt.
parent 94e1d640
...@@ -9,50 +9,50 @@ CET = datetime.timezone(datetime.timedelta(hours=1)) ...@@ -9,50 +9,50 @@ CET = datetime.timezone(datetime.timedelta(hours=1))
class CreateHashTest(unittest.TestCase): class CreateHashTest(unittest.TestCase):
def test_old_timestamp_format(self): def test_old_timestamp_format(self):
self.assertEqual("sqwl3la", create_old_hash(twtxt.models.Tweet( self.assertEqual("sqwl3la", create_old_hash(
created_at=datetime.datetime(2020, 12, 6, 20, 20, 35, tzinfo=CET), created_at=datetime.datetime(2020, 12, 6, 20, 20, 35, tzinfo=CET),
text="This is a test tweet for testing.", text="This is a test tweet for testing.",
source=twtxt.models.Source(nick="nick", url="http://0.0.0.0:8000/user/lyse/twtxt.txt")))) source=twtxt.models.Source(nick="nick", url="http://0.0.0.0:8000/user/lyse/twtxt.txt")))
def test_rfc3339_timestamp_with_milliseconds_precision_is_truncated_to_seconds_precision(self): def test_rfc3339_timestamp_with_milliseconds_precision_is_truncated_to_seconds_precision(self):
self.assertEqual("74qtyjq", create_hash(twtxt.models.Tweet( self.assertEqual("74qtyjq", create_hash(
created_at=datetime.datetime(2020, 12, 9, 15, 38, 42, 123, tzinfo=UTC), created_at=datetime.datetime(2020, 12, 9, 15, 38, 42, 123, tzinfo=UTC),
text="The twt hash now uses the RFC 3339 timestamp format.", text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))) source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
def test_rfc3339_timestamp_with_millisecomds_precision_is_truncated_to_seconds_precision_without_rounding(self): def test_rfc3339_timestamp_with_millisecomds_precision_is_truncated_to_seconds_precision_without_rounding(self):
self.assertEqual("74qtyjq", create_hash(twtxt.models.Tweet( self.assertEqual("74qtyjq", create_hash(
created_at=datetime.datetime(2020, 12, 9, 15, 38, 42, 999, tzinfo=UTC), created_at=datetime.datetime(2020, 12, 9, 15, 38, 42, 999, tzinfo=UTC),
text="The twt hash now uses the RFC 3339 timestamp format.", text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))) source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
def test_rfc3339_timestamp_with_seconds_precision_and_utc_plus_1_offset_is_kept_intact(self): def test_rfc3339_timestamp_with_seconds_precision_and_utc_plus_1_offset_is_kept_intact(self):
self.assertEqual("64u2m5a", create_hash(twtxt.models.Tweet( self.assertEqual("64u2m5a", create_hash(
created_at=datetime.datetime(2020, 12, 9, 16, 38, 42, tzinfo=CET), created_at=datetime.datetime(2020, 12, 9, 16, 38, 42, tzinfo=CET),
text="The twt hash now uses the RFC 3339 timestamp format.", text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))) source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
def test_rfc3339_timestamp_with_minutes_precision_is_expanded_to_seconds_precision(self): def test_rfc3339_timestamp_with_minutes_precision_is_expanded_to_seconds_precision(self):
self.assertEqual("a3c3k5q", create_hash(twtxt.models.Tweet( self.assertEqual("a3c3k5q", create_hash(
created_at=datetime.datetime(2020, 12, 9, 16, 38, 0, tzinfo=CET), created_at=datetime.datetime(2020, 12, 9, 16, 38, tzinfo=CET),
text="The twt hash now uses the RFC 3339 timestamp format.", text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))) source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
def test_rfc3339_timestamp_with_utc_is_rendered_as_designated_zulu_offset_rather_than_numeric_offset(self): def test_rfc3339_timestamp_with_utc_is_rendered_as_designated_zulu_offset_rather_than_numeric_offset(self):
self.assertEqual("74qtyjq", create_hash(twtxt.models.Tweet( self.assertEqual("74qtyjq", create_hash(
created_at=datetime.datetime(2020, 12, 9, 15, 38, 42, tzinfo=UTC), created_at=datetime.datetime(2020, 12, 9, 15, 38, 42, tzinfo=UTC),
text="The twt hash now uses the RFC 3339 timestamp format.", text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))) source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
self.assertEqual("74qtyjq", create_hash(twtxt.models.Tweet( self.assertEqual("74qtyjq", create_hash(
created_at=datetime.datetime(2020, 12, 9, 15, 38, 42, tzinfo=datetime.timezone(datetime.timedelta(hours=-0))), created_at=datetime.datetime(2020, 12, 9, 15, 38, 42, tzinfo=datetime.timezone(datetime.timedelta(hours=-0))),
text="The twt hash now uses the RFC 3339 timestamp format.", text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))) source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
def test_rfc3339_timestamp_without_explicit_timezone_information_is_assumed_to_be_in_utc(self): def test_rfc3339_timestamp_without_explicit_timezone_information_is_assumed_to_be_in_utc(self):
self.assertEqual("74qtyjq", create_hash(twtxt.models.Tweet( self.assertEqual("74qtyjq", create_hash(
created_at=datetime.datetime(2020, 12, 9, 15, 38, 42), created_at=datetime.datetime(2020, 12, 9, 15, 38, 42),
text="The twt hash now uses the RFC 3339 timestamp format.", text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))) source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -20,7 +20,7 @@ _tz_abbr_cache = { ...@@ -20,7 +20,7 @@ _tz_abbr_cache = {
} }
def create_hash(twt): def create_hash(created_at, text, source):
""" """
Create the hash of the given twt as specified in the Twt Hash Extension: Create the hash of the given twt as specified in the Twt Hash Extension:
https://dev.twtxt.net/doc/twthashextension.html https://dev.twtxt.net/doc/twthashextension.html
...@@ -29,23 +29,30 @@ def create_hash(twt): ...@@ -29,23 +29,30 @@ def create_hash(twt):
# (All comments in this function are taken from the aforementioned # (All comments in this function are taken from the aforementioned
# specification. Comments in parenthesis are mine, including this one.) # specification. Comments in parenthesis are mine, including this one.)
# The time must exactly be truncated or expanded to seconds precision. Any
# possible milliseconds must be cut off without any rounding. The seconds
# part of minutes precision times must be set to zero.
# (There is no millisecond keyword, we have to use microsecond, but it does
# exactly what we want.)
sec_precision_created_at = created_at.replace(microsecond=0)
# If the timestamp does not explicitly include any timezone information, it # If the timestamp does not explicitly include any timezone information, it
# must be assumed to be in UTC. # must be assumed to be in UTC.
if twt.created_at.tzinfo is None: if created_at.tzinfo is None:
tz_aware_created_at = twt.created_at.replace(tzinfo=datetime.timezone.utc) tz_aware_created_at = sec_precision_created_at.replace(tzinfo=datetime.timezone.utc)
else: else:
tz_aware_created_at = twt.created_at tz_aware_created_at = sec_precision_created_at
# All timezones representing UTC must be formatted using the designated # All timezones representing UTC must be formatted using the designated
# Zulu indicator 'Z' rather than the numeric offsets '+00:00' or '-00:00'. # Zulu indicator 'Z' rather than the numeric offsets '+00:00' or '-00:00'.
# (RFC 3339 permits and special cases '-00:00', however, it looks like it's # (RFC 3339 permits and special cases '-00:00', however, it looks like it's
# not possible to create a negative zero offset in Python.) # not possible to create a negative zero offset in Python.)
created_at = tz_aware_created_at.isoformat().replace("+00:00", "Z") rfc3339_created_at = tz_aware_created_at.isoformat().replace("+00:00", "Z")
# Each twt’s hash is calculated using its author, timestamp and contents. # Each twt’s hash is calculated using its author, timestamp and contents.
# The author feed URL, RFC 3339 formatted timestamp and twt text are joined # The author feed URL, RFC 3339 formatted timestamp and twt text are joined
# with line feeds: # with line feeds:
payload = "%s\n%s\n%s" % (twt.source.url, created_at, twt.text) payload = "%s\n%s\n%s" % (source.url, rfc3339_created_at, text)
# This UTF-8 encoded string is Blake2b hashed with 256 bits… # This UTF-8 encoded string is Blake2b hashed with 256 bits…
# (256 bits are 32 bytes) # (256 bits are 32 bytes)
...@@ -68,21 +75,21 @@ def create_hash(twt): ...@@ -68,21 +75,21 @@ def create_hash(twt):
return hash[-7:].lower() return hash[-7:].lower()
def create_old_hash(twt): def create_old_hash(created_at, text, source):
""" """
Create the hash of the given tweet according to prologic's blog post which Create the hash of the given tweet according to prologic's blog post which
is used to form conversations: is used to form conversations:
https://twtxt.net/blog/prologic/2020/10/18/making-twtxt-better https://twtxt.net/blog/prologic/2020/10/18/making-twtxt-better
""" """
created_at = twt.created_at.strftime("%Y-%m-%d %H:%M:%S %z") go_string_created_at = created_at.strftime("%Y-%m-%d %H:%M:%S %z")
tz = created_at[20:] tz = go_string_created_at[20:]
tz_abbr = _tz_abbr_cache.get(tz) tz_abbr = _tz_abbr_cache.get(tz)
if tz_abbr is None: if tz_abbr is None:
print("ERROR: no entry for TZ offset %s" % tz) print("ERROR: no entry for TZ offset %s" % tz)
created_at += " %s" % tz_abbr go_string_created_at += " %s" % tz_abbr
payload = "%s\n%s\n%s" % (twt.source.url, created_at, twt.text) payload = "%s\n%s\n%s" % (source.url, go_string_created_at, text)
sum256 = hashlib.blake2b(payload.encode("utf-8"), digest_size=32).digest() # 32 bytes are 256 bits sum256 = hashlib.blake2b(payload.encode("utf-8"), digest_size=32).digest() # 32 bytes are 256 bits
hash = base64.b32encode(sum256).decode("ascii") # thank you for the ASCII bytes… hash = base64.b32encode(sum256).decode("ascii") # thank you for the ASCII bytes…
hash = hash.replace("=", "") # no padding hash = hash.replace("=", "") # no padding
......
...@@ -133,8 +133,8 @@ class TwtxtManager: ...@@ -133,8 +133,8 @@ class TwtxtManager:
for further processing. for further processing.
""" """
twt.hash = twtxthash.create_hash(twt) twt.hash = twtxthash.create_hash(twt.created_at, twt.text, twt.source)
twt.old_hash = twtxthash.create_old_hash(twt) twt.old_hash = twtxthash.create_old_hash(twt.created_at, twt.text, twt.source)
if "\t" in twt.text: if "\t" in twt.text:
twt.text = twt.text.replace("\t", " ") twt.text = twt.text.replace("\t", " ")
twt.tokens = list(twtxtparser.parse_twt_text(twt.text)) twt.tokens = list(twtxtparser.parse_twt_text(twt.text))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment