Commit 744ff0dc authored by Lysander Trischler
Browse files

Recover invalid twtxt hashes in subjects

parent 38e015e0
......@@ -147,6 +147,24 @@ class ParseSubjectTest(unittest.TestCase):
self.assertEqual(SubjectHash(start_pos=0, end_pos=18, hash="abcd567", url=None),
_parse_subject("( #< abcd567 > ) more text", 0))
def test_only_non_twtxt_hash(self):
    """A bare ``#hash`` subject without angle brackets is still recovered.

    Whitespace around the hash inside the parentheses is tolerated, and
    the reported ``end_pos`` lies just past the closing parenthesis.
    """
    cases = (
        ("(#abcd567) more text", 10),
        ("( #abcd567 ) more text", 14),
    )
    for subject_text, expected_end in cases:
        expected = SubjectHash(start_pos=0, end_pos=expected_end,
                               hash="abcd567", url=None)
        self.assertEqual(expected, _parse_subject(subject_text, 0))
def test_only_non_twtxt_hash_too_short(self):
    """A six-character hash is rejected, with or without padding."""
    for subject_text in ("(#abcd56) more text", "( #abcd56 ) more text"):
        parsed = _parse_subject(subject_text, 0)
        self.assertIsNone(parsed)
def test_only_non_twtxt_hash_too_long(self):
    """An eight-character hash is rejected, with or without padding."""
    for subject_text in ("(#abcd5678) more text", "( #abcd5678 ) more text"):
        parsed = _parse_subject(subject_text, 0)
        self.assertIsNone(parsed)
def test_only_non_twtxt_hash_with_other_text(self):
    """Extra text after the hash inside the parentheses yields no subject."""
    for subject_text in ("(#abcd567 a) more text", "( #abcd567 a ) more text"):
        parsed = _parse_subject(subject_text, 0)
        self.assertIsNone(parsed)
def test_only_url(self):
self.assertEqual(SubjectHash(start_pos=0, end_pos=25, hash=None, url="https://example.com/"),
_parse_subject("(#<https://example.com/>)", 0))
......
......@@ -111,11 +111,36 @@ _TWT_HASH_RE = re.compile(r"^[a-z0-9]{7}$")
def _parse_subject(text, start_pos):
data, closing_pos = _extract_between(text, start_pos, None, "(", ")")
if data is None:
if not data:
return None
subject_hash_start_pos = _skip_whitespace(text, start_pos + 1)
if text[subject_hash_start_pos:subject_hash_start_pos + 2] != "#<":
return None
# This is not a valid twtxt hash tag, let's see if this looks at least
# like a non-twtxt hash, because lately this comes up every now and then
# in my timeline. Maybe we can recover the technically invalid twt hash
# from broken clients and put the twt in the correct conversation. Twt
# hashes are always seven chars long, so with the hash sign this makes a
# total length of eight.
if text[subject_hash_start_pos] != "#" or len(data) < 8:
# nope, we're out of luck
return None
hash_end_pos = subject_hash_start_pos + 8
hash = text[subject_hash_start_pos + 1:hash_end_pos]
if not _TWT_HASH_RE.match(hash):
return None
# there must be no other text in the subject, only optional whitespace
if text[hash_end_pos].isspace():
hash_end_pos = _skip_whitespace(text, start_pos=hash_end_pos,
end_pos=closing_pos)
if hash_end_pos + 1 != closing_pos:
return None
return SubjectHash(start_pos, closing_pos, hash, url=None)
# Chances are this is a real twtxt hash tag. At least the start looks like
# one.
subject_hash = _parse_subject_hash(text, subject_hash_start_pos, closing_pos)
if subject_hash is None:
return None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment