Add parsing for messages with lxml

This commit is contained in:
robin 2022-05-27 12:19:22 +02:00
parent c0c7aa772f
commit 6b576f102a
1 changed files with 29 additions and 2 deletions

31
main.py
View File

@ -3,6 +3,7 @@ from settings import *
from mastodon import Mastodon
from os.path import exists
import tweepy
from lxml.html import document_fromstring
def main():
if not exists(app_path):
Mastodon.create_app(
@ -36,7 +37,33 @@ def main():
timeline = mastodon_api.account_statuses(mastodon_user,exclude_replies=True)
for toot in timeline:
if(toot.visibility == 'public' and toot.account.id == mastodon_user.id and not toot.reblog):
print(toot)
tootfrm(toot.content)
#print(timeline)
def tootfrm(content):
    """Print the plain-text rendering of a toot's HTML content.

    Every ``<br />`` is turned into a newline, the HTML is parsed with lxml,
    mention links are rewritten in place by ``printit`` and the resulting
    text content is printed.

    Args:
        content: HTML string of the toot body. May be empty.
    """
    content = (content or "").replace('<br />', "\n")
    if not content.strip():
        # lxml refuses to build a document from blank input and raises
        # ParserError("Document is empty"); emit an empty line instead so a
        # single text-less toot does not abort the whole timeline loop.
        print("")
        return
    res = document_fromstring(content)
    # printit mutates the tree; its return value carries no information here.
    printit(res, "")
    print(res.text_content())
def printit(parent, body, block=" ", instance_suffix="@grml.de"):
    """Recursively rewrite ``u-url`` elements below *parent* in place.

    For every descendant whose class list contains ``u-url`` the element's
    leading text is replaced by its full text content followed by
    *instance_suffix*, and the first ``<span>`` beneath it is emptied and
    unwrapped so its text is not emitted a second time.

    Args:
        parent: lxml HTML element whose children are walked.
        body: Value threaded through the recursion; it is returned unchanged.
        block: Indentation prefix, extended by one space per nesting level.
            Only useful for debug output.
        instance_suffix: Text appended to the rewritten element's text.

    Returns:
        *body*, exactly as it was passed in.
    """
    for el in parent:
        if 'u-url' in el.classes:
            el.text = el.text_content() + instance_suffix
            # find() returns None when there is no <span> descendant; look it
            # up once and skip the cleanup instead of raising AttributeError.
            span = el.find('.//span')
            if span is not None:
                span.text = ''
                span.drop_tag()
        body = printit(el, body, block + " ", instance_suffix)
    return body
def test():
    """Run the HTML-to-text conversion on a fixed sample toot.

    Prints the text content before the tree is rewritten, then the value
    returned by ``printit``, then the text content after the rewrite.
    """
    sample_html = '<p>Python Test <a href="https://mastodon.grml.de/tags/ignore" class="mention hashtag" rel="tag">#<span>ignore</span></a><br /><span class="h-card"><a href="https://mastodon.grml.de/@toot" class="u-url mention">@<span>toot</span></a></span><br /><a href="https://twitter.com/mattxiv/status/1529181072931659777" target="_blank" rel="nofollow noopener noreferrer"><span class="invisible">https://</span><span class="ellipsis">twitter.com/mattxiv/status/152</span><span class="invisible">9181072931659777</span></a><br /><a href="https://mastodon.grml.de/tags/python" class="mention hashtag" rel="tag">#<span>python</span></a> <a href="https://mastodon.grml.de/tags/test" class="mention hashtag" rel="tag">#<span>test</span></a></p>'
    # Line breaks become real newlines before parsing.
    with_newlines = sample_html.replace('<br />', "\n")
    tree = document_fromstring(with_newlines)

    # Text before any element has been rewritten.
    print(tree.text_content())

    returned = printit(tree, "")
    print(returned)

    # Text after printit has modified the tree in place.
    print(tree.text_content())
if __name__ == "__main__":
    # Invoke the entry point exactly once; a second call would process the
    # whole timeline again.
    main()
    # test()  # swap in for main() to run the parser on the built-in sample