Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Lyse
tt
Commits
744ff0dc
Commit
744ff0dc
authored
Jan 23, 2021
by
Lysander Trischler
Browse files
Recover invalid twtxt hashes in subjects
parent
38e015e0
Changes
2
Hide whitespace changes
Inline
Side-by-side
test_twtxtparser.py
View file @
744ff0dc
...
...
@@ -147,6 +147,24 @@ class ParseSubjectTest(unittest.TestCase):
self
.
assertEqual
(
SubjectHash
(
start_pos
=
0
,
end_pos
=
18
,
hash
=
"abcd567"
,
url
=
None
),
_parse_subject
(
"( #< abcd567 > ) more text"
,
0
))
def
test_only_non_twtxt_hash
(
self
):
self
.
assertEqual
(
SubjectHash
(
start_pos
=
0
,
end_pos
=
10
,
hash
=
"abcd567"
,
url
=
None
),
_parse_subject
(
"(#abcd567) more text"
,
0
))
self
.
assertEqual
(
SubjectHash
(
start_pos
=
0
,
end_pos
=
14
,
hash
=
"abcd567"
,
url
=
None
),
_parse_subject
(
"( #abcd567 ) more text"
,
0
))
def
test_only_non_twtxt_hash_too_short
(
self
):
self
.
assertIsNone
(
_parse_subject
(
"(#abcd56) more text"
,
0
))
self
.
assertIsNone
(
_parse_subject
(
"( #abcd56 ) more text"
,
0
))
def
test_only_non_twtxt_hash_too_long
(
self
):
self
.
assertIsNone
(
_parse_subject
(
"(#abcd5678) more text"
,
0
))
self
.
assertIsNone
(
_parse_subject
(
"( #abcd5678 ) more text"
,
0
))
def
test_only_non_twtxt_hash_with_other_text
(
self
):
self
.
assertIsNone
(
_parse_subject
(
"(#abcd567 a) more text"
,
0
))
self
.
assertIsNone
(
_parse_subject
(
"( #abcd567 a ) more text"
,
0
))
def
test_only_url
(
self
):
self
.
assertEqual
(
SubjectHash
(
start_pos
=
0
,
end_pos
=
25
,
hash
=
None
,
url
=
"https://example.com/"
),
_parse_subject
(
"(#<https://example.com/>)"
,
0
))
...
...
twtxtparser.py
View file @
744ff0dc
...
...
@@ -111,11 +111,36 @@ _TWT_HASH_RE = re.compile(r"^[a-z0-9]{7}$")
def
_parse_subject
(
text
,
start_pos
):
data
,
closing_pos
=
_extract_between
(
text
,
start_pos
,
None
,
"("
,
")"
)
if
data
is
None
:
if
not
data
:
return
None
subject_hash_start_pos
=
_skip_whitespace
(
text
,
start_pos
+
1
)
if
text
[
subject_hash_start_pos
:
subject_hash_start_pos
+
2
]
!=
"#<"
:
return
None
# This is not a valid twtxt hash tag, let's see if this looks at least
# like a non-twtxt hash, because lately this comes up every now and then
# in my timeline. Maybe we can recover the technically invalid twt hash
# from broken clients and put the twt in the correct conversation. Twt
# hashes are always seven chars long, so with the hash sign this makes a
# total length of eight.
if
text
[
subject_hash_start_pos
]
!=
"#"
or
len
(
data
)
<
8
:
# nope, we're out of luck
return
None
hash_end_pos
=
subject_hash_start_pos
+
8
hash
=
text
[
subject_hash_start_pos
+
1
:
hash_end_pos
]
if
not
_TWT_HASH_RE
.
match
(
hash
):
return
None
# there must be no other text in the subject, only optional whitespace
if
text
[
hash_end_pos
].
isspace
():
hash_end_pos
=
_skip_whitespace
(
text
,
start_pos
=
hash_end_pos
,
end_pos
=
closing_pos
)
if
hash_end_pos
+
1
!=
closing_pos
:
return
None
return
SubjectHash
(
start_pos
,
closing_pos
,
hash
,
url
=
None
)
# Chances are this is a real twtxt hash tag. At least the start looks like
# one.
subject_hash
=
_parse_subject_hash
(
text
,
subject_hash_start_pos
,
closing_pos
)
if
subject_hash
is
None
:
return
None
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment