From 6b576f102ab858e485d8980eabf4243cea3b8f9d Mon Sep 17 00:00:00 2001 From: robin Date: Fri, 27 May 2022 12:19:22 +0200 Subject: [PATCH] Add parsing for messages with lxml --- main.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 069b414..6bb7283 100755 --- a/main.py +++ b/main.py @@ -3,6 +3,7 @@ from settings import * from mastodon import Mastodon from os.path import exists import tweepy +from lxml.html import document_fromstring def main(): if not exists(app_path): Mastodon.create_app( @@ -36,7 +37,33 @@ def main(): timeline = mastodon_api.account_statuses(mastodon_user,exclude_replies=True) for toot in timeline: if(toot.visibility == 'public' and toot.account.id == mastodon_user.id and not toot.reblog): - print(toot) + tootfrm(toot.content) #print(timeline) +def tootfrm(content): + content = content.replace('
', "\n") + res = document_fromstring(content) + printit(res, "") + print(res.text_content()) +def printit(parent, body, block=" "): + for el in parent: + # print(block, el, el.text_content()) + if 'u-url' in el.classes: + el.text = el.text_content() + "@grml.de" + el.find('.//span').text = '' + el.find('.//span').drop_tag() + # for classname in iter(el.classes): + # print(block + " " + classname) + body = printit(el, body, block + " ") + return body +def test(): + content = '

Python Test
@toot
twitter.com/mattxiv/status/152

' + # parser = MyHTMLParser() + content = content.replace('
', "\n") + res = document_fromstring(content) + print(res.text_content()) + body = printit(res, "") + print(body) + print(res.text_content()) if __name__ == "__main__": - main() \ No newline at end of file + main() + #test() \ No newline at end of file