Commit c170319e authored by Lysander Trischler

Implement rudimentary feed parsing

parent 87ce55dd
import datetime
-import twtxt.models
+import twtxtmodels
import unittest
from twtxthash import create_hash, create_old_hash
@@ -12,47 +12,60 @@ class CreateHashTest(unittest.TestCase):
        self.assertEqual("sqwl3la", create_old_hash(
            created_at=datetime.datetime(2020, 12, 6, 20, 20, 35, tzinfo=CET),
            text="This is a test tweet for testing.",
-            source=twtxt.models.Source(nick="nick", url="http://0.0.0.0:8000/user/lyse/twtxt.txt")))
+            source=twtxtmodels.Source(nick="nick", url="http://0.0.0.0:8000/user/lyse/twtxt.txt")))

    def test_rfc3339_timestamp_with_milliseconds_precision_is_truncated_to_seconds_precision(self):
        self.assertEqual("74qtyjq", create_hash(
            created_at=datetime.datetime(2020, 12, 9, 15, 38, 42, 123, tzinfo=UTC),
            text="The twt hash now uses the RFC 3339 timestamp format.",
-            source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
+            source=twtxtmodels.Source(nick="nick", url="https://example.com/twtxt.txt")))

    def test_rfc3339_timestamp_with_milliseconds_precision_is_truncated_to_seconds_precision_without_rounding(self):
self.assertEqual("74qtyjq", create_hash(
created_at=datetime.datetime(2020, 12, 9, 15, 38, 42, 999, tzinfo=UTC),
text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
source=twtxtmodels.Source(nick="nick", url="https://example.com/twtxt.txt")))
def test_rfc3339_timestamp_with_seconds_precision_and_utc_plus_1_offset_is_kept_intact(self):
self.assertEqual("64u2m5a", create_hash(
created_at=datetime.datetime(2020, 12, 9, 16, 38, 42, tzinfo=CET),
text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
source=twtxtmodels.Source(nick="nick", url="https://example.com/twtxt.txt")))
def test_rfc3339_timestamp_with_minutes_precision_is_expanded_to_seconds_precision(self):
self.assertEqual("a3c3k5q", create_hash(
created_at=datetime.datetime(2020, 12, 9, 16, 38, tzinfo=CET),
text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
source=twtxtmodels.Source(nick="nick", url="https://example.com/twtxt.txt")))
def test_rfc3339_timestamp_with_utc_is_rendered_as_designated_zulu_offset_rather_than_numeric_offset(self):
self.assertEqual("74qtyjq", create_hash(
created_at=datetime.datetime(2020, 12, 9, 15, 38, 42, tzinfo=UTC),
text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
source=twtxtmodels.Source(nick="nick", url="https://example.com/twtxt.txt")))
self.assertEqual("74qtyjq", create_hash(
created_at=datetime.datetime(2020, 12, 9, 15, 38, 42, tzinfo=datetime.timezone(datetime.timedelta(hours=-0))),
text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
source=twtxtmodels.Source(nick="nick", url="https://example.com/twtxt.txt")))
def test_rfc3339_timestamp_without_explicit_timezone_information_is_assumed_to_be_in_utc(self):
self.assertEqual("74qtyjq", create_hash(
created_at=datetime.datetime(2020, 12, 9, 15, 38, 42),
text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxt.models.Source(nick="nick", url="https://example.com/twtxt.txt")))
source=twtxtmodels.Source(nick="nick", url="https://example.com/twtxt.txt")))
def test_first_comment_meta_data_url_must_be_used_for_hashing_when_exists(self):
self.assertEqual("64u2m5a", create_hash(
created_at=datetime.datetime(2020, 12, 9, 16, 38, 42, tzinfo=CET),
text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxtmodels.Source(nick="nick", url="https://example.org/",
parsed_urls=["https://example.com/twtxt.txt"])))
self.assertEqual("64u2m5a", create_hash(
created_at=datetime.datetime(2020, 12, 9, 16, 38, 42, tzinfo=CET),
text="The twt hash now uses the RFC 3339 timestamp format.",
source=twtxtmodels.Source(nick="nick", url="https://example.org/",
parsed_urls=["https://example.com/twtxt.txt",
"https://example.net/twtxt.txt"])))
if __name__ == "__main__":
    unittest.main()
@@ -13,12 +13,15 @@ from twtxtparser import (
    _parse_plain_link,
    parse_twt_text,
    parse_twt,
    _parse_feed_meta_data,
    parse_feed,
    Mention,
    SubjectHash,
    TwtxtLink,
    MarkdownLink,
    PlainLink,
    Text,
    FeedMetaData,
)
from twtxtmodels import Twt, Source
@@ -412,6 +415,184 @@ class ParseTwtTest(unittest.TestCase):
                             subject="abcdefg"),
                         parse_twt("2021-09-23T01:14:34+02:00\t(#<abcdefg https://example.com/conv/abcdefg>) Hello World", self.source))


class ParseFeedMetaDataTest(unittest.TestCase):

    def test_non_comment_returns_none(self):
        self.assertIsNone(_parse_feed_meta_data("No comment"))

    def test_missing_key_returns_none(self):
        self.assertIsNone(_parse_feed_meta_data("# = https://example.com/twtxt/txt"))

    def test_missing_equals_returns_none(self):
        self.assertIsNone(_parse_feed_meta_data("# url https://example.com/twxt.txt"))

    def test_missing_value_returns_none(self):
        self.assertIsNone(_parse_feed_meta_data("# url ="))
        self.assertIsNone(_parse_feed_meta_data("# url = "))
        self.assertIsNone(_parse_feed_meta_data("#url= \t "))

    def test_whitespace_is_optional(self):
        self.assertEqual(FeedMetaData(key="url", value="https://example.com/twtxt.txt"),
                         _parse_feed_meta_data("# url = https://example.com/twtxt.txt"))
        self.assertEqual(FeedMetaData(key="url", value="https://example.com/twtxt.txt"),
                         _parse_feed_meta_data("# url=https://example.com/twtxt.txt"))
        self.assertEqual(FeedMetaData(key="url", value="https://example.com/twtxt.txt"),
                         _parse_feed_meta_data("#url=https://example.com/twtxt.txt"))
        self.assertEqual(FeedMetaData(key="url", value="https://example.com/twtxt.txt"),
                         _parse_feed_meta_data(" # url = https://example.com/twtxt.txt "))
        self.assertEqual(FeedMetaData(key="url", value="https://example.com/twtxt.txt"),
                         _parse_feed_meta_data("\t # url \t =\t https://example.com/twtxt.txt \t"))

    def test_successive_comment_hashes_are_allowed(self):
        self.assertEqual(FeedMetaData(key="url", value="https://example.com/twtxt.txt"),
                         _parse_feed_meta_data("##url=https://example.com/twtxt.txt"))
        self.assertEqual(FeedMetaData(key="url", value="https://example.com/twtxt.txt"),
                         _parse_feed_meta_data(" ##url=https://example.com/twtxt.txt"))
        self.assertEqual(FeedMetaData(key="url", value="https://example.com/twtxt.txt"),
                         _parse_feed_meta_data("\t # \t ## # url=https://example.com/twtxt.txt"))

    def test_keys_are_converted_to_lowercase(self):
        self.assertEqual(FeedMetaData(key="url", value="https://example.com/twtxt.txt"),
                         _parse_feed_meta_data("#URL=https://example.com/twtxt.txt"))
        self.assertEqual(FeedMetaData(key="url", value="https://example.com/twtxt.txt"),
                         _parse_feed_meta_data("#Url=https://example.com/twtxt.txt"))


class ParseFeedTest(unittest.TestCase):

    def test_parsing_empty_feed_succeeds(self):
        self.assertEqual([], list(parse_feed("", Source(nick="hugo", url="https://example.com/hugo"))))

    def test_parsing_single_twt_feed_without_newline_at_the_end_succeeds(self):
        self.assertEqual([Twt(created_at=dt(2021, 9, 23, 1, 14, 34, tzinfo=tzoffset(hours=2)),
                              text="Hello World",
                              source=Source(nick="hugo", url="https://example.com/hugo"),
                              hash="uumpjyq",
                              old_hash="3iinjfa",
                              tokens=[Text(start_pos=0, end_pos=11, text="Hello World")])],
                         list(parse_feed("2021-09-23T01:14:34+02:00\tHello World",
                                         Source(nick="hugo", url="https://example.com/hugo"))))

    def test_parsing_single_twt_feed_with_newline_at_the_end_succeeds(self):
        self.assertEqual([Twt(created_at=dt(2021, 9, 23, 1, 14, 34, tzinfo=tzoffset(hours=2)),
                              text="Hello World",
                              source=Source(nick="hugo", url="https://example.com/hugo"),
                              hash="uumpjyq",
                              old_hash="3iinjfa",
                              tokens=[Text(start_pos=0, end_pos=11, text="Hello World")])],
                         list(parse_feed("2021-09-23T01:14:34+02:00\tHello World\n",
                                         Source(nick="hugo", url="https://example.com/hugo"))))

    def test_parsing_multi_twts_feed_succeeds(self):
        expected_source = Source(nick="hugo", url="https://example.com/hugo")
        self.assertEqual([Twt(created_at=dt(2021, 9, 23, 1, 14, 34, tzinfo=tzoffset(hours=2)),
                              text="Hello World",
                              source=expected_source,
                              hash="uumpjyq",
                              old_hash="3iinjfa",
                              tokens=[Text(start_pos=0, end_pos=11, text="Hello World")]),
                          Twt(created_at=dt(2021, 9, 24, 12, 45, 17, tzinfo=tzoffset(hours=2)),
                              text="My second twt!",
                              source=expected_source,
                              hash="l7v22hq",
                              old_hash="g5nmpha",
                              tokens=[Text(start_pos=0, end_pos=14, text="My second twt!")])],
                         list(parse_feed("2021-09-23T01:14:34+02:00\tHello World\n"
                                         "2021-09-24T12:45:17+02:00\tMy second twt!\n",
                                         Source(nick="hugo", url="https://example.com/hugo"))))

    def test_parsing_ignores_empty_lines(self):
        expected_source = Source(nick="hugo", url="https://example.com/hugo")
        self.assertEqual([Twt(created_at=dt(2021, 9, 23, 1, 14, 34, tzinfo=tzoffset(hours=2)),
                              text="Hello World",
                              source=expected_source,
                              hash="uumpjyq",
                              old_hash="3iinjfa",
                              tokens=[Text(start_pos=0, end_pos=11, text="Hello World")]),
                          Twt(created_at=dt(2021, 9, 24, 12, 45, 17, tzinfo=tzoffset(hours=2)),
                              text="My second twt!",
                              source=expected_source,
                              hash="l7v22hq",
                              old_hash="g5nmpha",
                              tokens=[Text(start_pos=0, end_pos=14, text="My second twt!")])],
                         list(parse_feed("\n"
                                         "2021-09-23T01:14:34+02:00\tHello World\n"
                                         "\n"
                                         "2021-09-24T12:45:17+02:00\tMy second twt!\n"
                                         "\n",
                                         Source(nick="hugo", url="https://example.com/hugo"))))

    def test_parsing_ignores_comments(self):
        expected_source = Source(nick="hugo", url="https://example.com/hugo")
        self.assertEqual([Twt(created_at=dt(2021, 9, 23, 1, 14, 34, tzinfo=tzoffset(hours=2)),
                              text="Hello World",
                              source=expected_source,
                              hash="uumpjyq",
                              old_hash="3iinjfa",
                              tokens=[Text(start_pos=0, end_pos=11, text="Hello World")]),
                          Twt(created_at=dt(2021, 9, 24, 12, 45, 17, tzinfo=tzoffset(hours=2)),
                              text="My second twt!",
                              source=expected_source,
                              hash="l7v22hq",
                              old_hash="g5nmpha",
                              tokens=[Text(start_pos=0, end_pos=14, text="My second twt!")])],
                         list(parse_feed("# Some comment\n"
                                         "2021-09-23T01:14:34+02:00\tHello World\n"
                                         " #Another comment\n"
                                         "2021-09-24T12:45:17+02:00\tMy second twt!\n"
                                         "##For good measure one more\n",
                                         Source(nick="hugo", url="https://example.com/hugo"))))

    def test_parsing_extracts_all_url_meta_data_comments_and_uses_first_one_for_twt_hashing_from_now_on(self):
        expected_source = Source(nick="hugo", url="https://example.com/hugo",
                                 parsed_urls=["https://example.com/twtxt.txt",
                                              "even-not an URL, but extracted nonetheless for now",
                                              "https://example.org/"])
        self.assertEqual([Twt(created_at=dt(2021, 9, 23, 1, 14, 34, tzinfo=tzoffset(hours=2)),
                              text="Hello World",
                              source=expected_source,
                              hash="uumpjyq",
                              old_hash="3iinjfa",
                              tokens=[Text(start_pos=0, end_pos=11, text="Hello World")]),
                          Twt(created_at=dt(2021, 9, 24, 12, 45, 17, tzinfo=tzoffset(hours=2)),
                              text="My second twt!",
                              source=expected_source,
                              hash="5k4u2mq",
                              old_hash="g5nmpha",
                              tokens=[Text(start_pos=0, end_pos=14, text="My second twt!")]),
                          Twt(created_at=dt(2021, 9, 24, 12, 53, 0, tzinfo=tzoffset(hours=2)),
                              text="This twt is hashed with first encountered URL",
                              source=expected_source,
                              hash="qph3kkq",
                              old_hash="d5svs3a",
                              tokens=[Text(start_pos=0, end_pos=45, text="This twt is hashed with first encountered URL")])],
                         list(parse_feed("# this first twt will be hashed using the original source URL\n"
                                         "2021-09-23T01:14:34+02:00\tHello World\n"
                                         "# from now on, the new URL must be used\n"
                                         "# url = https://example.com/twtxt.txt\n"
                                         "2021-09-24T12:45:17+02:00\tMy second twt!\n"
                                         "##url=even-not an URL, but extracted nonetheless for now\n"
                                         "#url = https://example.org/\n"
                                         "2021-09-24T12:53:00+02:00\tThis twt is hashed with first encountered URL\n",
                                         Source(nick="hugo", url="https://example.com/hugo"))))

    def test_parsing_extracts_first_nick_meta_data_comment_and_stores_it_in_source(self):
        self.assertEqual([Twt(created_at=dt(2021, 9, 23, 1, 14, 34, tzinfo=tzoffset(hours=2)),
                              text="Hello World",
                              source=Source("hugo", url="https://example.com/hugo", parsed_nick="eugen"),
                              hash="uumpjyq",
                              old_hash="3iinjfa",
                              tokens=[Text(start_pos=0, end_pos=11, text="Hello World")])],
                         list(parse_feed("2021-09-23T01:14:34+02:00\tHello World\n"
                                         "# nick = eugen\n"
                                         "# nick = second one is ignored\n",
                                         Source(nick="hugo", url="https://example.com/hugo"))))

    # def test_parsing_uses_url_comments_for_hash_calculation(self):
    #     raise NotImplementedError


if __name__ == "__main__":
    unittest.main()
@@ -49,10 +49,22 @@ def create_hash(created_at, text, source):
    # not possible to create a negative zero offset in Python.)
    rfc3339_created_at = tz_aware_created_at.isoformat().replace("+00:00", "Z")

    # Feeds can include metadata at the beginning. This includes one or more
    # 'url' fields. If 'url' fields are present, the first one must be used
    # for hashing. If none are present, then the URL which was used to
    # retrieve the feed must be used.
    # (TODO once we use only our own data model, this can be simplified!)
    parsed_urls = getattr(source, "parsed_urls", None)
    if parsed_urls:
        hashing_url = parsed_urls[0]
    else:
        hashing_url = source.url

    # Each twt’s hash is calculated using its author, timestamp and contents.
    # The author feed URL, RFC 3339 formatted timestamp and twt text are
    # joined with line feeds:
-    payload = "%s\n%s\n%s" % (source.url, rfc3339_created_at, text)
+    payload = "%s\n%s\n%s" % (hashing_url, rfc3339_created_at, text)
    # This UTF-8 encoded string is Blake2b hashed with 256 bits…
    # (256 bits are 32 bytes)
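
# --- Illustration (not part of this commit) ----------------------------------
# The collapsed remainder of create_hash applies the Blake2b and Base32 steps
# the comments above describe. A minimal, self-contained sketch of the whole
# twt hash computation, assuming only the standard library; the helper name
# _twt_hash_sketch is hypothetical:
import base64
import hashlib

def _twt_hash_sketch(hashing_url, rfc3339_created_at, text):
    # join author feed URL, RFC 3339 timestamp and twt text with line feeds …
    payload = "%s\n%s\n%s" % (hashing_url, rfc3339_created_at, text)
    # … Blake2b-hash the UTF-8 encoded payload with 256 bits (32 bytes) …
    digest = hashlib.blake2b(payload.encode("utf-8"), digest_size=32).digest()
    # … and keep the last 7 characters of the unpadded, lowercase Base32 form.
    return base64.b32encode(digest).decode("ascii").lower().rstrip("=")[-7:]

# If this matches the collapsed code, the sketch reproduces the values the
# hash tests earlier in this commit assert, e.g. "74qtyjq" for the UTC case.
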
import twtxt.models


class Twt(twtxt.models.Tweet):
    """
    Twt is a single twt enhanced with parsed information.
    """

    def __init__(self, created_at, text, source=None,
                 hash=None, old_hash=None,
                 tokens=None, subject=None,
                 replies=None, read=None):
        super().__init__(created_at=created_at, text=text, source=source)
        self.hash = hash
        self.old_hash = old_hash
        self.tokens = tokens
        self.subject = subject
        self.replies = replies
        self.read = read

    def __eq__(self, other):
        if other is None \
                or self.created_at != other.created_at \
                or self.text != other.text \
                or self.source != other.source:
            return False
        for attr in ("hash", "old_hash", "tokens", "subject", "replies", "read"):
            a = getattr(self, attr)
            b = getattr(other, attr, None)
            if a != b:
                return False
        return True

    def __repr__(self):
        return f"Twt(created_at={self.created_at!r}, text={self.text!r}, " \
               f"source={self.source!r}, hash={self.hash!r}, old_hash={self.old_hash!r}, " \
               f"tokens={self.tokens!r}, subject={self.subject!r}, " \
               f"replies={self.replies!r}, read={self.read!r})"


class Source:
    """
    Source is a twtxt feed, either local or remote.
    """

    def __init__(self, nick, url=None, file=None, parsed_nick=None, parsed_urls=None,
                 meta_data=None):
        self.nick = nick
        self.url = url
        self.file = file
        self.parsed_nick = parsed_nick
        self.parsed_urls = parsed_urls
        self.meta_data = meta_data

    @property
    def hashing_url(self):
        """
        The URL used for hashing. It defaults to the source URL but can be
        overridden using a meta data comment in the twtxt.txt feed.
        """
        if self.parsed_urls:
            return self.parsed_urls[0]
        return self.url

    def __eq__(self, other):
        return other is not None \
            and self.nick == other.nick \
            and self.url == other.url \
            and self.file == other.file \
            and self.parsed_nick == other.parsed_nick \
            and self.parsed_urls == other.parsed_urls \
            and self.meta_data == other.meta_data

    def __repr__(self):
        return f"Source(nick={self.nick!r}, url={self.url!r}, file={self.file!r}, " \
               f"parsed_nick={self.parsed_nick!r}, parsed_urls={self.parsed_urls!r}, " \
               f"meta_data={self.meta_data!r})"

@@ -39,6 +39,7 @@ SubjectHash = _token_class('SubjectHash', 'hash', 'url')
TwtxtLink = _token_class('TwtxtLink', 'title', 'url')
MarkdownLink = _token_class('MarkdownLink', 'title', 'url')
PlainLink = _token_class('PlainLink', 'url')
FeedMetaData = collections.namedtuple('FeedMetaData', ('key', 'value'))


def _skip_whitespace(text, start_pos, end_pos=None):

@@ -292,3 +293,58 @@ def parse_twt(line, source=None):
    return twtxtmodels.Twt(created_at=created_at, text=text, source=source,
                           hash=hash, old_hash=old_hash, tokens=tokens, subject=subject)

# The value group requires a non-whitespace first character; '\S.*' (rather
# than '\S.+') also admits single-character values. Surrounding whitespace is
# stripped in _parse_feed_meta_data() below.
_FEED_META_DATA_RE = re.compile(r'\s*#[#\s]*([a-zA-Z0-9_-]+)\s*=\s*(\S.*)\s*')


def _parse_feed_meta_data(line):
    """
    Parse feed meta data from a comment line in a twtxt feed. If the line
    cannot be matched, `None` is returned.
    """
    match = _FEED_META_DATA_RE.match(line)
    if not match:
        return None
    return FeedMetaData(key=match.group(1).lower(), value=match.group(2).strip())


def parse_feed(feed, source):
    """
    Parse a whole twtxt feed and yield its twts one by one. Recognized meta
    data comments are stored on the given source as a side effect: all 'url'
    values, the first 'nick' and everything else in source.meta_data.
    """
    for line in feed.splitlines():
        lstripped_line = line.lstrip()

        # ignore empty lines
        if not lstripped_line:
            continue

        # extract supported meta data from comments and ignore the rest
        if lstripped_line.startswith("#"):
            feed_meta_data = _parse_feed_meta_data(lstripped_line)
            if not feed_meta_data:
                # ignore regular comment
                continue
            if feed_meta_data.key == "url":
                if source.parsed_urls is None:
                    source.parsed_urls = [feed_meta_data.value]
                else:
                    source.parsed_urls.append(feed_meta_data.value)
            elif feed_meta_data.key == "nick":
                # Only keep the first nick. Not sure if there is a use case
                # where multiple different nicks would be needed. Storing just
                # one is a little bit simpler, and that's really the only
                # reason. Keeping the first one is also closer to the 'url'
                # meta data comment handling, where the first URL is more
                # special than the others because it is used for twt hashing.
                #
                # Now that I've written such a long comment explaining the
                # reasoning, I could have just supported multiple nicks at the
                # same time instead… But I won't do it now. Just because.
                if source.parsed_nick is None:
                    source.parsed_nick = feed_meta_data.value
            else:
                # source.meta_data defaults to None, so the list has to be
                # created on first use, just like parsed_urls above.
                if source.meta_data is None:
                    source.meta_data = [feed_meta_data]
                else:
                    source.meta_data.append(feed_meta_data)
            continue

        # neither blank line nor comment, thus it should be a twt
        yield parse_twt(line, source)
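

# --- Usage sketch (illustrative, not part of this commit) --------------------
# parse_feed is a generator: iterate over it or wrap it in list() to
# materialise the twts. Module names are taken from the imports above, the
# feed text from the tests.
if __name__ == "__main__":
    from twtxtmodels import Source

    feed_text = ("# nick = eugen\n"
                 "# url = https://example.com/twtxt.txt\n"
                 "2021-09-23T01:14:34+02:00\tHello World\n")
    feed_source = Source(nick="hugo", url="https://example.com/hugo")
    for twt in parse_feed(feed_text, feed_source):
        print(twt.hash, twt.created_at, twt.text)
    # Side effect: feed_source.parsed_nick is now "eugen" and
    # feed_source.parsed_urls is ["https://example.com/twtxt.txt"].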