content.py 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826
  1. __filename__ = "content.py"
  2. __author__ = "Bob Mottram"
  3. __license__ = "AGPL3+"
  4. __version__ = "1.1.0"
  5. __maintainer__ = "Bob Mottram"
  6. __email__ = "bob@freedombone.net"
  7. __status__ = "Production"
  8. import os
  9. import email.parser
  10. from shutil import copyfile
  11. from utils import loadJson
  12. from utils import fileLastModified
  13. from utils import getLinkPrefixes
  14. def htmlReplaceQuoteMarks(content: str) -> str:
  15. """Replaces quotes with html formatting
  16. "hello" becomes <q>hello</q>
  17. """
  18. if '"' not in content:
  19. if '&quot;' not in content:
  20. return content
  21. newContent = content
  22. if '"' in content:
  23. sections = content.split('"')
  24. if len(sections) > 1:
  25. newContent = ''
  26. openQuote = True
  27. markup = False
  28. for ch in content:
  29. currChar = ch
  30. if ch == '<':
  31. markup = True
  32. elif ch == '>':
  33. markup = False
  34. elif ch == '"' and not markup:
  35. if openQuote:
  36. currChar = '“'
  37. else:
  38. currChar = '”'
  39. openQuote = not openQuote
  40. newContent += currChar
  41. if '&quot;' in newContent:
  42. openQuote = True
  43. content = newContent
  44. newContent = ''
  45. ctr = 0
  46. sections = content.split('&quot;')
  47. noOfSections = len(sections)
  48. for s in sections:
  49. newContent += s
  50. if ctr < noOfSections - 1:
  51. if openQuote:
  52. newContent += '“'
  53. else:
  54. newContent += '”'
  55. openQuote = not openQuote
  56. ctr += 1
  57. return newContent
  58. def dangerousMarkup(content: str) -> bool:
  59. """Returns true if the given content contains dangerous html markup
  60. """
  61. if '<' not in content:
  62. return False
  63. if '>' not in content:
  64. return False
  65. contentSections = content.split('<')
  66. invalidStrings = ('script', 'canvas', 'style', 'abbr',
  67. 'frame', 'iframe', 'html', 'body',
  68. 'hr')
  69. for markup in contentSections:
  70. if '>' not in markup:
  71. continue
  72. markup = markup.split('>')[0].strip()
  73. if ' ' not in markup:
  74. for badStr in invalidStrings:
  75. if badStr in markup:
  76. return True
  77. else:
  78. for badStr in invalidStrings:
  79. if badStr + ' ' in markup:
  80. return True
  81. return False
  82. def switchWords(baseDir: str, nickname: str, domain: str, content: str) -> str:
  83. """Performs word replacements. eg. Trump -> The Orange Menace
  84. """
  85. switchWordsFilename = baseDir + '/accounts/' + \
  86. nickname + '@' + domain + '/replacewords.txt'
  87. if not os.path.isfile(switchWordsFilename):
  88. return content
  89. with open(switchWordsFilename, 'r') as fp:
  90. for line in fp:
  91. replaceStr = line.replace('\n', '').replace('\r', '')
  92. wordTransform = None
  93. if '->' in replaceStr:
  94. wordTransform = replaceStr.split('->')
  95. elif ':' in replaceStr:
  96. wordTransform = replaceStr.split(':')
  97. elif ',' in replaceStr:
  98. wordTransform = replaceStr.split(',')
  99. elif ';' in replaceStr:
  100. wordTransform = replaceStr.split(';')
  101. elif '-' in replaceStr:
  102. wordTransform = replaceStr.split('-')
  103. if not wordTransform:
  104. continue
  105. if len(wordTransform) == 2:
  106. replaceStr1 = wordTransform[0].strip().replace('"', '')
  107. replaceStr2 = wordTransform[1].strip().replace('"', '')
  108. content = content.replace(replaceStr1, replaceStr2)
  109. return content
  110. def replaceEmojiFromTags(content: str, tag: [], messageType: str) -> str:
  111. """Uses the tags to replace :emoji: with html image markup
  112. """
  113. for tagItem in tag:
  114. if not tagItem.get('type'):
  115. continue
  116. if tagItem['type'] != 'Emoji':
  117. continue
  118. if not tagItem.get('name'):
  119. continue
  120. if not tagItem.get('icon'):
  121. continue
  122. if not tagItem['icon'].get('url'):
  123. continue
  124. if '/' not in tagItem['icon']['url']:
  125. continue
  126. if tagItem['name'] not in content:
  127. continue
  128. iconName = tagItem['icon']['url'].split('/')[-1]
  129. if iconName:
  130. if len(iconName) > 1:
  131. if iconName[0].isdigit():
  132. if '.' in iconName:
  133. iconName = iconName.split('.')[0]
  134. # see https://unicode.org/
  135. # emoji/charts/full-emoji-list.html
  136. if '-' not in iconName:
  137. # a single code
  138. try:
  139. replaceChar = chr(int("0x" + iconName, 16))
  140. content = content.replace(tagItem['name'],
  141. replaceChar)
  142. except BaseException:
  143. pass
  144. else:
  145. # sequence of codes
  146. iconCodes = iconName.split('-')
  147. iconCodeSequence = ''
  148. for icode in iconCodes:
  149. try:
  150. iconCodeSequence += chr(int("0x" +
  151. icode, 16))
  152. except BaseException:
  153. iconCodeSequence = ''
  154. break
  155. if iconCodeSequence:
  156. content = content.replace(tagItem['name'],
  157. iconCodeSequence)
  158. htmlClass = 'emoji'
  159. if messageType == 'post header':
  160. htmlClass = 'emojiheader'
  161. if messageType == 'profile':
  162. htmlClass = 'emojiprofile'
  163. emojiHtml = "<img src=\"" + tagItem['icon']['url'] + "\" alt=\"" + \
  164. tagItem['name'].replace(':', '') + \
  165. "\" align=\"middle\" class=\"" + htmlClass + "\"/>"
  166. content = content.replace(tagItem['name'], emojiHtml)
  167. return content
  168. def addMusicTag(content: str, tag: str) -> str:
  169. """If a music link is found then ensure that the post is
  170. tagged appropriately
  171. """
  172. if '#' not in tag:
  173. tag = '#'+tag
  174. if tag in content:
  175. return content
  176. musicSites = ('soundcloud.com', 'bandcamp.com')
  177. musicSiteFound = False
  178. for site in musicSites:
  179. if site+'/' in content:
  180. musicSiteFound = True
  181. break
  182. if not musicSiteFound:
  183. return content
  184. return ':music: ' + content + ' ' + tag + ' '
  185. def addWebLinks(content: str) -> str:
  186. """Adds markup for web links
  187. """
  188. if ':' not in content:
  189. return content
  190. prefixes = getLinkPrefixes()
  191. # do any of these prefixes exist within the content?
  192. prefixFound = False
  193. for prefix in prefixes:
  194. if prefix in content:
  195. prefixFound = True
  196. break
  197. # if there are no prefixes then just keep the content we have
  198. if not prefixFound:
  199. return content
  200. maxLinkLength = 40
  201. content = content.replace('\r', '')
  202. words = content.replace('\n', ' --linebreak-- ').split(' ')
  203. replaceDict = {}
  204. for w in words:
  205. if ':' not in w:
  206. continue
  207. # does the word begin with a prefix?
  208. prefixFound = False
  209. for prefix in prefixes:
  210. if w.startswith(prefix):
  211. prefixFound = True
  212. break
  213. if not prefixFound:
  214. continue
  215. # the word contains a prefix
  216. if w.endswith('.') or w.endswith(';'):
  217. w = w[:-1]
  218. markup = '<a href="' + w + \
  219. '" rel="nofollow noopener" target="_blank">'
  220. for prefix in prefixes:
  221. if w.startswith(prefix):
  222. markup += '<span class="invisible">' + prefix + '</span>'
  223. break
  224. linkText = w
  225. for prefix in prefixes:
  226. linkText = linkText.replace(prefix, '')
  227. # prevent links from becoming too long
  228. if len(linkText) > maxLinkLength:
  229. markup += '<span class="ellipsis">' + \
  230. linkText[:maxLinkLength] + '</span>'
  231. markup += '<span class="invisible">' + \
  232. linkText[maxLinkLength:] + '</span></a>'
  233. else:
  234. markup += '<span class="ellipsis">' + linkText + '</span></a>'
  235. replaceDict[w] = markup
  236. # do the replacements
  237. for url, markup in replaceDict.items():
  238. content = content.replace(url, markup)
  239. # replace any line breaks
  240. content = content.replace(' --linebreak-- ', '<br>')
  241. return content
  242. def validHashTag(hashtag: str) -> bool:
  243. """Returns true if the give hashtag contains valid characters
  244. """
  245. # long hashtags are not valid
  246. if len(hashtag) >= 32:
  247. return False
  248. validChars = set('0123456789' +
  249. 'abcdefghijklmnopqrstuvwxyz' +
  250. 'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
  251. if set(hashtag).issubset(validChars):
  252. return True
  253. return False
  254. def addHashTags(wordStr: str, httpPrefix: str, domain: str,
  255. replaceHashTags: {}, postHashtags: {}) -> bool:
  256. """Detects hashtags and adds them to the replacements dict
  257. Also updates the hashtags list to be added to the post
  258. """
  259. if replaceHashTags.get(wordStr):
  260. return True
  261. hashtag = wordStr[1:]
  262. if not validHashTag(hashtag):
  263. return False
  264. hashtagUrl = httpPrefix + "://" + domain + "/tags/" + hashtag
  265. postHashtags[hashtag] = {
  266. 'href': hashtagUrl,
  267. 'name': '#'+hashtag,
  268. 'type': 'Hashtag'
  269. }
  270. replaceHashTags[wordStr] = "<a href=\"" + hashtagUrl + \
  271. "\" class=\"mention hashtag\" rel=\"tag\">#<span>" + \
  272. hashtag + "</span></a>"
  273. return True
  274. def loadEmojiDict(emojiDataFilename: str, emojiDict: {}) -> None:
  275. """Creates an emoji dictionary based on emoji/emoji-data.txt
  276. """
  277. if not os.path.isfile(emojiDataFilename):
  278. return
  279. with open(emojiDataFilename, "r") as fileHandler:
  280. for line in fileHandler:
  281. if len(line) < 5:
  282. continue
  283. if line.startswith('#'):
  284. continue
  285. if '; Emoji' not in line:
  286. continue
  287. if ')' not in line:
  288. continue
  289. emojiUnicode = line.split(' ')[0]
  290. if len(emojiUnicode) < 4:
  291. continue
  292. if '..' in emojiUnicode:
  293. emojiUnicode = emojiUnicode.split('..')[0]
  294. emojiName = line.split(')', 1)[1].strip()
  295. emojiName = emojiName.replace('\n', '').replace('\r', '')
  296. emojiName = emojiName.replace(' ', '').replace('-', '')
  297. if '..' in emojiName:
  298. emojiName = emojiName.split('..')[0]
  299. emojiDict[emojiName.lower()] = emojiUnicode
  300. def addEmoji(baseDir: str, wordStr: str,
  301. httpPrefix: str, domain: str,
  302. replaceEmoji: {}, postTags: {},
  303. emojiDict: {}) -> bool:
  304. """Detects Emoji and adds them to the replacements dict
  305. Also updates the tags list to be added to the post
  306. """
  307. if not wordStr.startswith(':'):
  308. return False
  309. if not wordStr.endswith(':'):
  310. return False
  311. if len(wordStr) < 3:
  312. return False
  313. if replaceEmoji.get(wordStr):
  314. return True
  315. # remove leading and trailing : characters
  316. emoji = wordStr[1:]
  317. emoji = emoji[:-1]
  318. # is the text of the emoji valid?
  319. if not validHashTag(emoji):
  320. return False
  321. if not emojiDict.get(emoji):
  322. return False
  323. emojiFilename = baseDir + '/emoji/' + emojiDict[emoji] + '.png'
  324. if not os.path.isfile(emojiFilename):
  325. return False
  326. emojiUrl = httpPrefix + "://" + domain + \
  327. "/emoji/" + emojiDict[emoji] + '.png'
  328. postTags[emoji] = {
  329. 'icon': {
  330. 'mediaType': 'image/png',
  331. 'type': 'Image',
  332. 'url': emojiUrl
  333. },
  334. 'name': ':'+emoji+':',
  335. "updated": fileLastModified(emojiFilename),
  336. "id": emojiUrl.replace('.png', ''),
  337. 'type': 'Emoji'
  338. }
  339. return True
  340. def addMention(wordStr: str, httpPrefix: str, following: str,
  341. replaceMentions: {}, recipients: [], tags: {}) -> bool:
  342. """Detects mentions and adds them to the replacements dict and
  343. recipients list
  344. """
  345. possibleHandle = wordStr[1:]
  346. # @nick
  347. if following and '@' not in possibleHandle:
  348. # fall back to a best effort match against the following list
  349. # if no domain was specified. eg. @nick
  350. possibleNickname = possibleHandle
  351. for follow in following:
  352. if follow.startswith(possibleNickname + '@'):
  353. replaceDomain = \
  354. follow.replace('\n', '').replace('\r', '').split('@')[1]
  355. recipientActor = httpPrefix + "://" + \
  356. replaceDomain + "/users/" + possibleNickname
  357. if recipientActor not in recipients:
  358. recipients.append(recipientActor)
  359. tags[wordStr] = {
  360. 'href': recipientActor,
  361. 'name': wordStr,
  362. 'type': 'Mention'
  363. }
  364. replaceMentions[wordStr] = \
  365. "<span class=\"h-card\"><a href=\"" + httpPrefix + \
  366. "://" + replaceDomain + "/@" + possibleNickname + \
  367. "\" class=\"u-url mention\">@<span>" + possibleNickname + \
  368. "</span></a></span>"
  369. return True
  370. return False
  371. possibleNickname = None
  372. possibleDomain = None
  373. if '@' not in possibleHandle:
  374. return False
  375. possibleNickname = possibleHandle.split('@')[0]
  376. if not possibleNickname:
  377. return False
  378. possibleDomain = \
  379. possibleHandle.split('@')[1].strip('\n').strip('\r')
  380. if not possibleDomain:
  381. return False
  382. if following:
  383. for follow in following:
  384. if follow.replace('\n', '').replace('\r', '') != possibleHandle:
  385. continue
  386. recipientActor = httpPrefix + "://" + \
  387. possibleDomain + "/users/" + possibleNickname
  388. if recipientActor not in recipients:
  389. recipients.append(recipientActor)
  390. tags[wordStr] = {
  391. 'href': recipientActor,
  392. 'name': wordStr,
  393. 'type': 'Mention'
  394. }
  395. replaceMentions[wordStr] = \
  396. "<span class=\"h-card\"><a href=\"" + httpPrefix + \
  397. "://" + possibleDomain + "/@" + possibleNickname + \
  398. "\" class=\"u-url mention\">@<span>" + possibleNickname + \
  399. "</span></a></span>"
  400. return True
  401. # @nick@domain
  402. if not (possibleDomain == 'localhost' or '.' in possibleDomain):
  403. return False
  404. recipientActor = httpPrefix + "://" + \
  405. possibleDomain + "/users/" + possibleNickname
  406. if recipientActor not in recipients:
  407. recipients.append(recipientActor)
  408. tags[wordStr] = {
  409. 'href': recipientActor,
  410. 'name': wordStr,
  411. 'type': 'Mention'
  412. }
  413. replaceMentions[wordStr] = \
  414. "<span class=\"h-card\"><a href=\"" + httpPrefix + \
  415. "://" + possibleDomain + "/@" + possibleNickname + \
  416. "\" class=\"u-url mention\">@<span>" + possibleNickname + \
  417. "</span></a></span>"
  418. return True
  419. def replaceContentDuplicates(content: str) -> str:
  420. """Replaces invalid duplicates within content
  421. """
  422. while '<<' in content:
  423. content = content.replace('<<', '<')
  424. while '>>' in content:
  425. content = content.replace('>>', '>')
  426. content = content.replace('<\\p>', '')
  427. return content
  428. def removeTextFormatting(content: str) -> str:
  429. """Removes markup for bold, italics, etc
  430. """
  431. if '<' not in content:
  432. return content
  433. removeMarkup = ('b', 'i', 'ul', 'ol', 'li', 'em', 'strong',
  434. 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5')
  435. for markup in removeMarkup:
  436. content = content.replace('<' + markup + '>', '')
  437. content = content.replace('</' + markup + '>', '')
  438. content = content.replace('<' + markup.upper() + '>', '')
  439. content = content.replace('</' + markup.upper() + '>', '')
  440. return content
  441. def removeHtml(content: str) -> str:
  442. """Removes html links from the given content.
  443. Used to ensure that profile descriptions don't contain dubious content
  444. """
  445. if '<' not in content:
  446. return content
  447. removing = False
  448. content = content.replace('<q>', '"').replace('</q>', '"')
  449. result = ''
  450. for ch in content:
  451. if ch == '<':
  452. removing = True
  453. elif ch == '>':
  454. removing = False
  455. elif not removing:
  456. result += ch
  457. return result
  458. def removeLongWords(content: str, maxWordLength: int,
  459. longWordsList: []) -> str:
  460. """Breaks up long words so that on mobile screens this doesn't
  461. disrupt the layout
  462. """
  463. content = replaceContentDuplicates(content)
  464. if ' ' not in content:
  465. # handle a single very long string with no spaces
  466. contentStr = content.replace('<p>', '').replace(r'<\p>', '')
  467. if '://' not in contentStr:
  468. if len(contentStr) > maxWordLength:
  469. if '<p>' in content:
  470. content = '<p>' + contentStr[:maxWordLength] + r'<\p>'
  471. else:
  472. content = content[:maxWordLength]
  473. return content
  474. words = content.split(' ')
  475. if not longWordsList:
  476. longWordsList = []
  477. for wordStr in words:
  478. if len(wordStr) > maxWordLength:
  479. if wordStr not in longWordsList:
  480. longWordsList.append(wordStr)
  481. for wordStr in longWordsList:
  482. if wordStr.startswith('<'):
  483. continue
  484. if len(wordStr) == 76:
  485. if wordStr.upper() == wordStr:
  486. # tox address
  487. continue
  488. if '=\"' in wordStr:
  489. continue
  490. if '@' in wordStr:
  491. if '@@' not in wordStr:
  492. continue
  493. if '=.ed25519' in wordStr:
  494. continue
  495. if '.onion' in wordStr:
  496. continue
  497. if '.i2p' in wordStr:
  498. continue
  499. if 'https:' in wordStr:
  500. continue
  501. elif 'http:' in wordStr:
  502. continue
  503. elif 'i2p:' in wordStr:
  504. continue
  505. elif 'gnunet:' in wordStr:
  506. continue
  507. elif 'dat:' in wordStr:
  508. continue
  509. elif 'hyper:' in wordStr:
  510. continue
  511. elif 'briar:' in wordStr:
  512. continue
  513. if '<' in wordStr:
  514. replaceWord = wordStr.split('<', 1)[0]
  515. content = content.replace(wordStr, replaceWord)
  516. wordStr = replaceWord
  517. if '/' in wordStr:
  518. continue
  519. if len(wordStr[maxWordLength:]) < maxWordLength:
  520. content = content.replace(wordStr,
  521. wordStr[:maxWordLength] + '\n' +
  522. wordStr[maxWordLength:])
  523. else:
  524. content = content.replace(wordStr,
  525. wordStr[:maxWordLength])
  526. if content.startswith('<p>'):
  527. if not content.endswith('</p>'):
  528. content = content.strip()+'</p>'
  529. return content
  530. def addHtmlTags(baseDir: str, httpPrefix: str,
  531. nickname: str, domain: str, content: str,
  532. recipients: [], hashtags: {}, isJsonContent=False) -> str:
  533. """ Replaces plaintext mentions such as @nick@domain into html
  534. by matching against known following accounts
  535. """
  536. if content.startswith('<p>'):
  537. return htmlReplaceQuoteMarks(content)
  538. maxWordLength = 40
  539. content = content.replace('\r', '')
  540. content = content.replace('\n', ' --linebreak-- ')
  541. content = addMusicTag(content, 'nowplaying')
  542. words = content.replace(',', ' ').replace(';', ' ').split(' ')
  543. # remove . for words which are not mentions
  544. newWords = []
  545. for wordIndex in range(0, len(words)):
  546. wordStr = words[wordIndex]
  547. if wordStr.endswith('.'):
  548. if not wordStr.startswith('@'):
  549. wordStr = wordStr[:-1]
  550. if wordStr.startswith('.'):
  551. wordStr = wordStr[1:]
  552. newWords.append(wordStr)
  553. words = newWords
  554. replaceMentions = {}
  555. replaceHashTags = {}
  556. replaceEmoji = {}
  557. emojiDict = {}
  558. originalDomain = domain
  559. if ':' in domain:
  560. domain = domain.split(':')[0]
  561. followingFilename = baseDir + '/accounts/' + \
  562. nickname + '@' + domain + '/following.txt'
  563. # read the following list so that we can detect just @nick
  564. # in addition to @nick@domain
  565. following = None
  566. if '@' in words:
  567. if os.path.isfile(followingFilename):
  568. with open(followingFilename, "r") as f:
  569. following = f.readlines()
  570. # extract mentions and tags from words
  571. longWordsList = []
  572. for wordStr in words:
  573. wordLen = len(wordStr)
  574. if wordLen > 2:
  575. if wordLen > maxWordLength:
  576. longWordsList.append(wordStr)
  577. firstChar = wordStr[0]
  578. if firstChar == '@':
  579. if addMention(wordStr, httpPrefix, following,
  580. replaceMentions, recipients, hashtags):
  581. continue
  582. elif firstChar == '#':
  583. if addHashTags(wordStr, httpPrefix, originalDomain,
  584. replaceHashTags, hashtags):
  585. continue
  586. elif ':' in wordStr:
  587. wordStr2 = wordStr.split(':')[1]
  588. # print('TAG: emoji located - '+wordStr)
  589. if not emojiDict:
  590. # emoji.json is generated so that it can be customized and
  591. # the changes will be retained even if default_emoji.json
  592. # is subsequently updated
  593. if not os.path.isfile(baseDir + '/emoji/emoji.json'):
  594. copyfile(baseDir + '/emoji/default_emoji.json',
  595. baseDir + '/emoji/emoji.json')
  596. emojiDict = loadJson(baseDir + '/emoji/emoji.json')
  597. # print('TAG: looking up emoji for :'+wordStr2+':')
  598. addEmoji(baseDir, ':' + wordStr2 + ':', httpPrefix,
  599. originalDomain, replaceEmoji, hashtags,
  600. emojiDict)
  601. # replace words with their html versions
  602. for wordStr, replaceStr in replaceMentions.items():
  603. content = content.replace(wordStr, replaceStr)
  604. for wordStr, replaceStr in replaceHashTags.items():
  605. content = content.replace(wordStr, replaceStr)
  606. if not isJsonContent:
  607. for wordStr, replaceStr in replaceEmoji.items():
  608. content = content.replace(wordStr, replaceStr)
  609. content = addWebLinks(content)
  610. if longWordsList:
  611. content = removeLongWords(content, maxWordLength, longWordsList)
  612. content = content.replace(' --linebreak-- ', '</p><p>')
  613. return '<p>' + htmlReplaceQuoteMarks(content) + '</p>'
  614. def getMentionsFromHtml(htmlText: str,
  615. matchStr="<span class=\"h-card\"><a href=\"") -> []:
  616. """Extracts mentioned actors from the given html content string
  617. """
  618. mentions = []
  619. if matchStr not in htmlText:
  620. return mentions
  621. mentionsList = htmlText.split(matchStr)
  622. for mentionStr in mentionsList:
  623. if '"' not in mentionStr:
  624. continue
  625. actorStr = mentionStr.split('"')[0]
  626. if actorStr.startswith('http') or \
  627. actorStr.startswith('gnunet') or \
  628. actorStr.startswith('i2p') or \
  629. actorStr.startswith('hyper') or \
  630. actorStr.startswith('dat:'):
  631. if actorStr not in mentions:
  632. mentions.append(actorStr)
  633. return mentions
  634. def extractMediaInFormPOST(postBytes, boundary, name: str):
  635. """Extracts the binary encoding for image/video/audio within a http
  636. form POST
  637. Returns the media bytes and the remaining bytes
  638. """
  639. imageStartBoundary = b'Content-Disposition: form-data; name="' + \
  640. name.encode('utf8', 'ignore') + b'";'
  641. imageStartLocation = postBytes.find(imageStartBoundary)
  642. if imageStartLocation == -1:
  643. return None, postBytes
  644. # bytes after the start boundary appears
  645. mediaBytes = postBytes[imageStartLocation:]
  646. # look for the next boundary
  647. imageEndBoundary = boundary.encode('utf8', 'ignore')
  648. imageEndLocation = mediaBytes.find(imageEndBoundary)
  649. if imageEndLocation == -1:
  650. # no ending boundary
  651. return mediaBytes, postBytes[:imageStartLocation]
  652. # remaining bytes after the end of the image
  653. remainder = mediaBytes[imageEndLocation:]
  654. # remove bytes after the end boundary
  655. mediaBytes = mediaBytes[:imageEndLocation]
  656. # return the media and the before+after bytes
  657. return mediaBytes, postBytes[:imageStartLocation] + remainder
  658. def saveMediaInFormPOST(mediaBytes, debug: bool,
  659. filenameBase=None) -> (str, str):
  660. """Saves the given media bytes extracted from http form POST
  661. Returns the filename and attachment type
  662. """
  663. if not mediaBytes:
  664. if debug:
  665. print('DEBUG: No media found within POST')
  666. return None, None
  667. mediaLocation = -1
  668. searchStr = ''
  669. filename = None
  670. # directly search the binary array for the beginning
  671. # of an image
  672. extensionList = {
  673. 'png': 'image/png',
  674. 'jpeg': 'image/jpeg',
  675. 'gif': 'image/gif',
  676. 'webp': 'image/webp',
  677. 'mp4': 'video/mp4',
  678. 'ogv': 'video/ogv',
  679. 'mp3': 'audio/mpeg',
  680. 'ogg': 'audio/ogg'
  681. }
  682. detectedExtension = None
  683. for extension, contentType in extensionList.items():
  684. searchStr = b'Content-Type: ' + contentType.encode('utf8', 'ignore')
  685. mediaLocation = mediaBytes.find(searchStr)
  686. if mediaLocation > -1:
  687. # image/video/audio binaries
  688. if extension == 'jpeg':
  689. extension = 'jpg'
  690. elif extension == 'mpeg':
  691. extension = 'mp3'
  692. filename = filenameBase + '.' + extension
  693. attachmentMediaType = \
  694. searchStr.decode().split('/')[0].replace('Content-Type: ', '')
  695. detectedExtension = extension
  696. break
  697. if not filename:
  698. return None, None
  699. # locate the beginning of the image, after any
  700. # carriage returns
  701. startPos = mediaLocation + len(searchStr)
  702. for offset in range(1, 8):
  703. if mediaBytes[startPos+offset] != 10:
  704. if mediaBytes[startPos+offset] != 13:
  705. startPos += offset
  706. break
  707. # remove any existing image files with a different format
  708. extensionTypes = ('png', 'jpg', 'jpeg', 'gif', 'webp')
  709. for ex in extensionTypes:
  710. if ex == detectedExtension:
  711. continue
  712. possibleOtherFormat = \
  713. filename.replace('.temp', '').replace('.' +
  714. detectedExtension, '.' +
  715. ex)
  716. if os.path.isfile(possibleOtherFormat):
  717. os.remove(possibleOtherFormat)
  718. fd = open(filename, 'wb')
  719. fd.write(mediaBytes[startPos:])
  720. fd.close()
  721. return filename, attachmentMediaType
  722. def extractTextFieldsInPOST(postBytes, boundary, debug: bool) -> {}:
  723. """Returns a dictionary containing the text fields of a http form POST
  724. The boundary argument comes from the http header
  725. """
  726. msg = email.parser.BytesParser().parsebytes(postBytes)
  727. if debug:
  728. print('DEBUG: POST arriving ' +
  729. msg.get_payload(decode=True).decode('utf-8'))
  730. messageFields = msg.get_payload(decode=True)
  731. messageFields = messageFields.decode('utf-8').split(boundary)
  732. fields = {}
  733. # examine each section of the POST, separated by the boundary
  734. for f in messageFields:
  735. if f == '--':
  736. continue
  737. if ' name="' not in f:
  738. continue
  739. postStr = f.split(' name="', 1)[1]
  740. if '"' not in postStr:
  741. continue
  742. postKey = postStr.split('"', 1)[0]
  743. postValueStr = postStr.split('"', 1)[1]
  744. if ';' in postValueStr:
  745. continue
  746. if '\r\n' not in postValueStr:
  747. continue
  748. postLines = postValueStr.split('\r\n')
  749. postValue = ''
  750. if len(postLines) > 2:
  751. for line in range(2, len(postLines)-1):
  752. if line > 2:
  753. postValue += '\n'
  754. postValue += postLines[line]
  755. fields[postKey] = postValue
  756. return fields