Commit 4eae1732 authored by Lysander Trischler

Refactor hash interface for future reuse

This refactoring also uncovered a bug that had been masked by the
`twtxt.models.Tweet` constructor, which did the actual conversion to
seconds precision. Without that Tweet container in between anymore, two
precision-related tests failed. So this precision handling needed to be
replicated into the hash creation – where it actually belongs. One day,
we'll finally get rid of `twtxt.models.Tweet` entirely, so the logic
won't really be duplicated anymore.

This is part of surgically removing the twtxt reference implementation
of tt.
parent 94e1d640
......@@ -9,50 +9,50 @@ CET = datetime.timezone(datetime.timedelta(hours=1))
class CreateHashTest(unittest.TestCase):
def test_old_timestamp_format(self):
self.assertEqual("sqwl3la", create_old_hash(twtxt.models.Tweet(
self.assertEqual("sqwl3la", create_old_hash(
created_at=datetime.datetime(2020, 12, 6, 20, 20, 35, tzinfo=CET),
text="This is a test tweet for testing.",
source=twtxt.models.Source(nick="nick", url="http://0.0.0.0:8000/user/lyse/twtxt.txt"))))
source=twtxt.models.Source(nick="nick", url="http://0.0.0.0:8000/user/lyse/twtxt.txt")))
def test_rfc3339_timestamp_with_milliseconds_precision_is_truncated_to_seconds_precision(self):
self.assertEqual("74qtyjq", create_hash(twtxt.models.Tweet(
self.assertEqual("74qtyjq", create_hash(
created_at=datetime.datetime(2020, 12, 9, 15, 38, 42, 123, tzinfo=UTC),
text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt"))))
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
def test_rfc3339_timestamp_with_millisecomds_precision_is_truncated_to_seconds_precision_without_rounding(self):
self.assertEqual("74qtyjq", create_hash(twtxt.models.Tweet(
self.assertEqual("74qtyjq", create_hash(
created_at=datetime.datetime(2020, 12, 9, 15, 38, 42, 999, tzinfo=UTC),
text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt"))))
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
def test_rfc3339_timestamp_with_seconds_precision_and_utc_plus_1_offset_is_kept_intact(self):
self.assertEqual("64u2m5a", create_hash(twtxt.models.Tweet(
self.assertEqual("64u2m5a", create_hash(
created_at=datetime.datetime(2020, 12, 9, 16, 38, 42, tzinfo=CET),
text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt"))))
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
def test_rfc3339_timestamp_with_minutes_precision_is_expanded_to_seconds_precision(self):
self.assertEqual("a3c3k5q", create_hash(twtxt.models.Tweet(
created_at=datetime.datetime(2020, 12, 9, 16, 38, 0, tzinfo=CET),
self.assertEqual("a3c3k5q", create_hash(
created_at=datetime.datetime(2020, 12, 9, 16, 38, tzinfo=CET),
text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt"))))
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
def test_rfc3339_timestamp_with_utc_is_rendered_as_designated_zulu_offset_rather_than_numeric_offset(self):
self.assertEqual("74qtyjq", create_hash(twtxt.models.Tweet(
self.assertEqual("74qtyjq", create_hash(
created_at=datetime.datetime(2020, 12, 9, 15, 38, 42, tzinfo=UTC),
text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt"))))
self.assertEqual("74qtyjq", create_hash(twtxt.models.Tweet(
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
self.assertEqual("74qtyjq", create_hash(
created_at=datetime.datetime(2020, 12, 9, 15, 38, 42, tzinfo=datetime.timezone(datetime.timedelta(hours=-0))),
text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt"))))
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
def test_rfc3339_timestamp_without_explicit_timezone_information_is_assumed_to_be_in_utc(self):
self.assertEqual("74qtyjq", create_hash(twtxt.models.Tweet(
self.assertEqual("74qtyjq", create_hash(
created_at=datetime.datetime(2020, 12, 9, 15, 38, 42),
text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt"))))
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
if __name__ == "__main__":
......
......@@ -20,7 +20,7 @@ _tz_abbr_cache = {
}
def create_hash(twt):
def create_hash(created_at, text, source):
"""
Create the hash of the given twt as specified in the Twt Hash Extension:
https://dev.twtxt.net/doc/twthashextension.html
......@@ -29,23 +29,30 @@ def create_hash(twt):
# (All comments in this function are taken from the aforementioned
# specification. Comments in parenthesis are mine, including this one.)
# The time must exactly be truncated or expanded to seconds precision. Any
# possible milliseconds must be cut off without any rounding. The seconds
# part of minutes precision times must be set to zero.
# (There is no millisecond keyword, we have to use microsecond, but it does
# exactly what we want.)
sec_precision_created_at = created_at.replace(microsecond=0)
# If the timestamp does not explicitly include any timezone information, it
# must be assumed to be in UTC.
if twt.created_at.tzinfo is None:
tz_aware_created_at = twt.created_at.replace(tzinfo=datetime.timezone.utc)
if created_at.tzinfo is None:
tz_aware_created_at = sec_precision_created_at.replace(tzinfo=datetime.timezone.utc)
else:
tz_aware_created_at = twt.created_at
tz_aware_created_at = sec_precision_created_at
# All timezones representing UTC must be formatted using the designated
# Zulu indicator 'Z' rather than the numeric offsets '+00:00' or '-00:00'.
# (RFC 3339 permits and special cases '-00:00', however, it looks like it's
# not possible to create a negative zero offset in Python.)
created_at = tz_aware_created_at.isoformat().replace("+00:00", "Z")
rfc3339_created_at = tz_aware_created_at.isoformat().replace("+00:00", "Z")
# Each twt’s hash is calculated using its author, timestamp and contents.
# The author feed URL, RFC 3339 formatted timestamp and twt text are joined
# with line feeds:
payload = "%s\n%s\n%s" % (twt.source.url, created_at, twt.text)
payload = "%s\n%s\n%s" % (source.url, rfc3339_created_at, text)
# This UTF-8 encoded string is Blake2b hashed with 256 bits…
# (256 bits are 32 bytes)
......@@ -68,21 +75,21 @@ def create_hash(twt):
return hash[-7:].lower()
def create_old_hash(twt):
def create_old_hash(created_at, text, source):
"""
Create the hash of the given tweet according to prologic's blog post which
is used to form conversations:
https://twtxt.net/blog/prologic/2020/10/18/making-twtxt-better
"""
created_at = twt.created_at.strftime("%Y-%m-%d %H:%M:%S %z")
tz = created_at[20:]
go_string_created_at = created_at.strftime("%Y-%m-%d %H:%M:%S %z")
tz = go_string_created_at[20:]
tz_abbr = _tz_abbr_cache.get(tz)
if tz_abbr is None:
print("ERROR: no entry for TZ offset %s" % tz)
created_at += " %s" % tz_abbr
go_string_created_at += " %s" % tz_abbr
payload = "%s\n%s\n%s" % (twt.source.url, created_at, twt.text)
payload = "%s\n%s\n%s" % (source.url, go_string_created_at, text)
sum256 = hashlib.blake2b(payload.encode("utf-8"), digest_size=32).digest() # 32 bytes are 256 bits
hash = base64.b32encode(sum256).decode("ascii") # thank you for the ASCII bytes…
hash = hash.replace("=", "") # no padding
......
......@@ -133,8 +133,8 @@ class TwtxtManager:
for further processing.
"""
twt.hash = twtxthash.create_hash(twt)
twt.old_hash = twtxthash.create_old_hash(twt)
twt.hash = twtxthash.create_hash(twt.created_at, twt.text, twt.source)
twt.old_hash = twtxthash.create_old_hash(twt.created_at, twt.text, twt.source)
if "\t" in twt.text:
twt.text = twt.text.replace("\t", " ")
twt.tokens = list(twtxtparser.parse_twt_text(twt.text))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment