content.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532
  1. __filename__ = "content.py"
  2. __author__ = "Bob Mottram"
  3. __license__ = "AGPL3+"
  4. __version__ = "1.1.0"
  5. __maintainer__ = "Bob Mottram"
  6. __email__ = "bob@freedombone.net"
  7. __status__ = "Production"
  8. import os
  9. import time
  10. import email.parser
  11. from shutil import copyfile
  12. from utils import loadJson
  13. def replaceEmojiFromTags(content: str,tag: [],messageType: str) -> str:
  14. """Uses the tags to replace :emoji: with html image markup
  15. """
  16. for tagItem in tag:
  17. if not tagItem.get('type'):
  18. continue
  19. if tagItem['type']!='Emoji':
  20. continue
  21. if not tagItem.get('name'):
  22. continue
  23. if not tagItem.get('icon'):
  24. continue
  25. if not tagItem['icon'].get('url'):
  26. continue
  27. if tagItem['name'] not in content:
  28. continue
  29. htmlClass='emoji'
  30. if messageType=='post header':
  31. htmlClass='emojiheader'
  32. if messageType=='profile':
  33. htmlClass='emojiprofile'
  34. emojiHtml="<img src=\""+tagItem['icon']['url']+"\" alt=\""+tagItem['name'].replace(':','')+"\" align=\"middle\" class=\""+htmlClass+"\"/>"
  35. content=content.replace(tagItem['name'],emojiHtml)
  36. return content
  37. def addMusicTag(content: str,tag: str) -> str:
  38. """If a music link is found then ensure that the post is tagged appropriately
  39. """
  40. if '#' not in tag:
  41. tag='#'+tag
  42. if tag in content:
  43. return content
  44. musicSites=['soundcloud.com','bandcamp.com']
  45. musicSiteFound=False
  46. for site in musicSites:
  47. if site+'/' in content:
  48. musicSiteFound=True
  49. break
  50. if not musicSiteFound:
  51. return content
  52. return ':music: '+content+' '+tag+' '
  53. def addWebLinks(content: str) -> str:
  54. """Adds markup for web links
  55. """
  56. if not ('https://' in content or 'http://' in content):
  57. return content
  58. maxLinkLength=40
  59. words=content.replace('\n',' --linebreak-- ').split(' ')
  60. replaceDict={}
  61. for w in words:
  62. if w.startswith('https://') or \
  63. w.startswith('http://') or \
  64. w.startswith('dat://'):
  65. if w.endswith('.') or w.endswith(';'):
  66. w=w[:-1]
  67. markup='<a href="'+w+'" rel="nofollow noopener" target="_blank">'
  68. if w.startswith('https://'):
  69. markup+='<span class="invisible">https://</span>'
  70. elif w.startswith('http://'):
  71. markup+='<span class="invisible">http://</span>'
  72. elif w.startswith('dat://'):
  73. markup+='<span class="invisible">dat://</span>'
  74. linkText=w.replace('https://','').replace('http://','').replace('dat://','')
  75. # prevent links from becoming too long
  76. if len(linkText)>maxLinkLength:
  77. markup+='<span class="ellipsis">'+linkText[:maxLinkLength]+'</span>'
  78. markup+='<span class="invisible">'+linkText[maxLinkLength:]+'</span></a>'
  79. else:
  80. markup+='<span class="ellipsis">'+linkText+'</span></a>'
  81. replaceDict[w]=markup
  82. for url,markup in replaceDict.items():
  83. content=content.replace(url,markup)
  84. content=content.replace(' --linebreak-- ','<br>')
  85. return content
  86. def validHashTag(hashtag: str) -> bool:
  87. """Returns true if the give hashtag contains valid characters
  88. """
  89. validChars = set('0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
  90. if set(hashtag).issubset(validChars):
  91. return True
  92. return False
  93. def addHashTags(wordStr: str,httpPrefix: str,domain: str, \
  94. replaceHashTags: {},postHashtags: {}) -> bool:
  95. """Detects hashtags and adds them to the replacements dict
  96. Also updates the hashtags list to be added to the post
  97. """
  98. if replaceHashTags.get(wordStr):
  99. return True
  100. hashtag=wordStr[1:]
  101. if not validHashTag(hashtag):
  102. return False
  103. hashtagUrl=httpPrefix+"://"+domain+"/tags/"+hashtag
  104. postHashtags[hashtag]= {
  105. 'href': hashtagUrl,
  106. 'name': '#'+hashtag,
  107. 'type': 'Hashtag'
  108. }
  109. replaceHashTags[wordStr]= \
  110. "<a href=\""+hashtagUrl+"\" class=\"mention hashtag\" rel=\"tag\">#<span>"+hashtag+"</span></a>"
  111. return True
  112. def loadEmojiDict(emojiDataFilename: str,emojiDict: {}) -> None:
  113. """Creates an emoji dictionary based on emoji/emoji-data.txt
  114. """
  115. if not os.path.isfile(emojiDataFilename):
  116. return
  117. with open (emojiDataFilename, "r") as fileHandler:
  118. for line in fileHandler:
  119. if len(line)<5:
  120. continue
  121. if line.startswith('#'):
  122. continue
  123. if '; Emoji' not in line:
  124. continue
  125. if ')' not in line:
  126. continue
  127. emojiUnicode=line.split(' ')[0]
  128. if len(emojiUnicode)<4:
  129. continue
  130. if '..' in emojiUnicode:
  131. emojiUnicode=emojiUnicode.split('..')[0]
  132. emojiName=line.split(')',1)[1].strip().replace('\n','').replace(' ','').replace('-','')
  133. if '..' in emojiName:
  134. emojiName=emojiName.split('..')[0]
  135. emojiDict[emojiName.lower()]=emojiUnicode
  136. def addEmoji(baseDir: str,wordStr: str,httpPrefix: str,domain: str,replaceEmoji: {},postTags: {},emojiDict: {}) -> bool:
  137. """Detects Emoji and adds them to the replacements dict
  138. Also updates the tags list to be added to the post
  139. """
  140. if not wordStr.startswith(':'):
  141. return False
  142. if not wordStr.endswith(':'):
  143. return False
  144. if len(wordStr)<3:
  145. return False
  146. if replaceEmoji.get(wordStr):
  147. return True
  148. # remove leading and trailing : characters
  149. emoji=wordStr[1:]
  150. emoji=emoji[:-1]
  151. # is the text of the emoji valid?
  152. if not validHashTag(emoji):
  153. return False
  154. if not emojiDict.get(emoji):
  155. return False
  156. emojiFilename=baseDir+'/emoji/'+emojiDict[emoji]+'.png'
  157. if not os.path.isfile(emojiFilename):
  158. return False
  159. emojiUrl=httpPrefix+"://"+domain+"/emoji/"+emojiDict[emoji]+'.png'
  160. postTags[emoji]= {
  161. 'icon': {
  162. 'mediaType': 'image/png',
  163. 'type': 'Image',
  164. 'url': emojiUrl
  165. },
  166. 'name': ':'+emoji+':',
  167. 'type': 'Emoji'
  168. }
  169. return True
  170. def addMention(wordStr: str,httpPrefix: str,following: str,replaceMentions: {},recipients: [],tags: {}) -> bool:
  171. """Detects mentions and adds them to the replacements dict and recipients list
  172. """
  173. possibleHandle=wordStr[1:]
  174. # @nick
  175. if following and '@' not in possibleHandle:
  176. # fall back to a best effort match against the following list
  177. # if no domain was specified. eg. @nick
  178. possibleNickname=possibleHandle
  179. for follow in following:
  180. if follow.startswith(possibleNickname+'@'):
  181. replaceDomain=follow.replace('\n','').split('@')[1]
  182. recipientActor=httpPrefix+"://"+replaceDomain+"/users/"+possibleNickname
  183. if recipientActor not in recipients:
  184. recipients.append(recipientActor)
  185. tags[wordStr]={
  186. 'href': recipientActor,
  187. 'name': wordStr,
  188. 'type': 'Mention'
  189. }
  190. replaceMentions[wordStr]="<span class=\"h-card\"><a href=\""+httpPrefix+"://"+replaceDomain+"/@"+possibleNickname+"\" class=\"u-url mention\">@<span>"+possibleNickname+"</span></a></span>"
  191. return True
  192. return False
  193. possibleNickname=None
  194. possibleDomain=None
  195. if '@' not in possibleHandle:
  196. return False
  197. possibleNickname=possibleHandle.split('@')[0]
  198. if not possibleNickname:
  199. return False
  200. possibleDomain=possibleHandle.split('@')[1].strip('\n')
  201. if not possibleDomain:
  202. return False
  203. if following:
  204. for follow in following:
  205. if follow.replace('\n','')!=possibleHandle:
  206. continue
  207. recipientActor=httpPrefix+"://"+possibleDomain+"/users/"+possibleNickname
  208. if recipientActor not in recipients:
  209. recipients.append(recipientActor)
  210. tags[wordStr]={
  211. 'href': recipientActor,
  212. 'name': wordStr,
  213. 'type': 'Mention'
  214. }
  215. replaceMentions[wordStr]="<span class=\"h-card\"><a href=\""+httpPrefix+"://"+possibleDomain+"/@"+possibleNickname+"\" class=\"u-url mention\">@<span>"+possibleNickname+"</span></a></span>"
  216. return True
  217. # @nick@domain
  218. if not (possibleDomain=='localhost' or '.' in possibleDomain):
  219. return False
  220. recipientActor=httpPrefix+"://"+possibleDomain+"/users/"+possibleNickname
  221. if recipientActor not in recipients:
  222. recipients.append(recipientActor)
  223. tags[wordStr]={
  224. 'href': recipientActor,
  225. 'name': wordStr,
  226. 'type': 'Mention'
  227. }
  228. replaceMentions[wordStr]="<span class=\"h-card\"><a href=\""+httpPrefix+"://"+possibleDomain+"/@"+possibleNickname+"\" class=\"u-url mention\">@<span>"+possibleNickname+"</span></a></span>"
  229. return True
  230. def removeLongWords(content: str,maxWordLength: int,longWordsList: []) -> str:
  231. """Breaks up long words so that on mobile screens this doesn't disrupt the layout
  232. """
  233. if ' ' not in content:
  234. # handle a single very long string with no spaces
  235. contentStr=content.replace('<p>','').replace('<\p>','')
  236. if '://' not in contentStr:
  237. if len(contentStr)>maxWordLength:
  238. if '<p>' in content:
  239. content='<p>'+contentStr[:maxWordLength]+'<\p>'
  240. else:
  241. content=content[:maxWordLength]
  242. return content
  243. words=content.split(' ')
  244. if not longWordsList:
  245. longWordsList=[]
  246. for wordStr in words:
  247. if len(wordStr)>maxWordLength:
  248. if wordStr not in longWordsList:
  249. longWordsList.append(wordStr)
  250. for wordStr in longWordsList:
  251. if wordStr.startswith('<'):
  252. continue
  253. if '=\"' in wordStr:
  254. continue
  255. if '@' in wordStr:
  256. if '@@' not in wordStr:
  257. continue
  258. if '=.ed25519' in wordStr:
  259. continue
  260. if '.onion' in wordStr:
  261. continue
  262. if '.i2p' in wordStr:
  263. continue
  264. if 'https:' in wordStr:
  265. continue
  266. elif 'http:' in wordStr:
  267. continue
  268. elif 'dat:' in wordStr:
  269. continue
  270. if '<' in wordStr:
  271. replaceWord=wordStr.split('<',1)[0]
  272. content= \
  273. content.replace(wordStr,replaceWord)
  274. wordStr=replaceWord
  275. if '/' in wordStr:
  276. continue
  277. if len(wordStr[maxWordLength:])<maxWordLength:
  278. content= \
  279. content.replace(wordStr, \
  280. wordStr[:maxWordLength]+'\n'+ \
  281. wordStr[maxWordLength:])
  282. else:
  283. content= \
  284. content.replace(wordStr, \
  285. wordStr[:maxWordLength])
  286. if content.startswith('<p>'):
  287. if not content.endswith('</p>'):
  288. content=content.strip()+'</p>'
  289. return content
  290. def addHtmlTags(baseDir: str,httpPrefix: str, \
  291. nickname: str,domain: str,content: str, \
  292. recipients: [],hashtags: {},isJsonContent=False) -> str:
  293. """ Replaces plaintext mentions such as @nick@domain into html
  294. by matching against known following accounts
  295. """
  296. if content.startswith('<p>'):
  297. return content
  298. maxWordLength=40
  299. content=content.replace('\n',' --linebreak-- ')
  300. content=addMusicTag(content,'nowplaying')
  301. words=content.replace(',',' ').replace(';',' ').split(' ')
  302. # remove . for words which are not mentions
  303. wordCtr=0
  304. newWords=[]
  305. for wordIndex in range(0,len(words)):
  306. wordStr=words[wordIndex]
  307. if wordStr.endswith('.'):
  308. if not wordStr.startswith('@'):
  309. wordStr=wordStr[:-1]
  310. if wordStr.startswith('.'):
  311. wordStr=wordStr[1:]
  312. newWords.append(wordStr)
  313. words=newWords
  314. replaceMentions={}
  315. replaceHashTags={}
  316. replaceEmoji={}
  317. emojiDict={}
  318. originalDomain=domain
  319. if ':' in domain:
  320. domain=domain.split(':')[0]
  321. followingFilename=baseDir+'/accounts/'+nickname+'@'+domain+'/following.txt'
  322. # read the following list so that we can detect just @nick
  323. # in addition to @nick@domain
  324. following=None
  325. if '@' in words:
  326. if os.path.isfile(followingFilename):
  327. with open(followingFilename, "r") as f:
  328. following = f.readlines()
  329. # extract mentions and tags from words
  330. longWordsList=[]
  331. for wordStr in words:
  332. wordLen=len(wordStr)
  333. if wordLen>2:
  334. if wordLen>maxWordLength:
  335. longWordsList.append(wordStr)
  336. firstChar=wordStr[0]
  337. if firstChar=='@':
  338. if addMention(wordStr,httpPrefix,following,replaceMentions,recipients,hashtags):
  339. continue
  340. elif firstChar=='#':
  341. if addHashTags(wordStr,httpPrefix,originalDomain,replaceHashTags,hashtags):
  342. continue
  343. elif ':' in wordStr:
  344. #print('TAG: emoji located - '+wordStr)
  345. wordStr2=wordStr.split(':')[1]
  346. if not emojiDict:
  347. # emoji.json is generated so that it can be customized and the changes
  348. # will be retained even if default_emoji.json is subsequently updated
  349. if not os.path.isfile(baseDir+'/emoji/emoji.json'):
  350. copyfile(baseDir+'/emoji/default_emoji.json',baseDir+'/emoji/emoji.json')
  351. emojiDict=loadJson(baseDir+'/emoji/emoji.json')
  352. #print('TAG: looking up emoji for :'+wordStr2+':')
  353. addEmoji(baseDir,':'+wordStr2+':',httpPrefix,originalDomain,replaceEmoji,hashtags,emojiDict)
  354. # replace words with their html versions
  355. for wordStr,replaceStr in replaceMentions.items():
  356. content=content.replace(wordStr,replaceStr)
  357. for wordStr,replaceStr in replaceHashTags.items():
  358. content=content.replace(wordStr,replaceStr)
  359. if not isJsonContent:
  360. for wordStr,replaceStr in replaceEmoji.items():
  361. content=content.replace(wordStr,replaceStr)
  362. content=addWebLinks(content)
  363. if longWordsList:
  364. content=removeLongWords(content,maxWordLength,longWordsList)
  365. content=content.replace(' --linebreak-- ','</p><p>')
  366. return '<p>'+content+'</p>'
  367. def getMentionsFromHtml(htmlText: str,matchStr="<span class=\"h-card\"><a href=\"") -> []:
  368. """Extracts mentioned actors from the given html content string
  369. """
  370. mentions=[]
  371. if matchStr not in htmlText:
  372. return mentions
  373. mentionsList=htmlText.split(matchStr)
  374. for mentionStr in mentionsList:
  375. if '"' not in mentionStr:
  376. continue
  377. actorStr=mentionStr.split('"')[0]
  378. if actorStr.startswith('http') or \
  379. actorStr.startswith('dat:'):
  380. if actorStr not in mentions:
  381. mentions.append(actorStr)
  382. return mentions
  383. def extractMediaInFormPOST(postBytes,boundary,name: str):
  384. """Extracts the binary encoding for image/video/audio within a http form POST
  385. Returns the media bytes and the remaining bytes
  386. """
  387. imageStartBoundary=b'Content-Disposition: form-data; name="'+name.encode('utf8', 'ignore')+b'";'
  388. imageStartLocation=postBytes.find(imageStartBoundary)
  389. if imageStartLocation==-1:
  390. return None,postBytes
  391. # bytes after the start boundary appears
  392. mediaBytes=postBytes[imageStartLocation:]
  393. # look for the next boundary
  394. imageEndBoundary=boundary.encode('utf8', 'ignore')
  395. imageEndLocation=mediaBytes.find(imageEndBoundary)
  396. if imageEndLocation==-1:
  397. # no ending boundary
  398. return mediaBytes,postBytes[:imageStartLocation]
  399. # remaining bytes after the end of the image
  400. remainder=mediaBytes[imageEndLocation:]
  401. # remove bytes after the end boundary
  402. mediaBytes=mediaBytes[:imageEndLocation]
  403. # return the media and the before+after bytes
  404. return mediaBytes,postBytes[:imageStartLocation]+remainder
  405. def saveMediaInFormPOST(mediaBytes,debug: bool, \
  406. filenameBase=None) -> (str,str):
  407. """Saves the given media bytes extracted from http form POST
  408. Returns the filename and attachment type
  409. """
  410. if not mediaBytes:
  411. if debug:
  412. print('DEBUG: No media found within POST')
  413. return None,None
  414. mediaLocation=-1
  415. searchStr=''
  416. filename=None
  417. # directly search the binary array for the beginning
  418. # of an image
  419. extensionList= {
  420. 'png': 'image/png',
  421. 'jpeg': 'image/jpeg',
  422. 'gif': 'image/gif',
  423. 'webp': 'image/webp',
  424. 'mp4': 'video/mp4',
  425. 'ogv': 'video/ogv',
  426. 'mp3': 'audio/mpeg',
  427. 'ogg': 'audio/ogg'
  428. }
  429. detectedExtension=None
  430. for extension,contentType in extensionList.items():
  431. searchStr=b'Content-Type: '+contentType.encode('utf8', 'ignore')
  432. mediaLocation=mediaBytes.find(searchStr)
  433. if mediaLocation>-1:
  434. if extension=='jpeg':
  435. extension='jpg'
  436. elif extension=='mpeg':
  437. extension='mp3'
  438. filename=filenameBase+'.'+extension
  439. attachmentMediaType= \
  440. searchStr.decode().split('/')[0].replace('Content-Type: ','')
  441. detectedExtension=extension
  442. break
  443. if not filename:
  444. return None,None
  445. # locate the beginning of the image, after any
  446. # carriage returns
  447. startPos=mediaLocation+len(searchStr)
  448. for offset in range(1,8):
  449. if mediaBytes[startPos+offset]!=10:
  450. if mediaBytes[startPos+offset]!=13:
  451. startPos+=offset
  452. break
  453. # remove any existing image files with a different format
  454. extensionTypes=('png','jpg','jpeg','gif','webp')
  455. for ex in extensionTypes:
  456. if ex==detectedExtension:
  457. continue
  458. possibleOtherFormat=filename.replace('.temp','').replace('.'+detectedExtension,'.'+ex)
  459. if os.path.isfile(possibleOtherFormat):
  460. os.remove(possibleOtherFormat)
  461. fd = open(filename, 'wb')
  462. fd.write(mediaBytes[startPos:])
  463. fd.close()
  464. return filename,attachmentMediaType
  465. def extractTextFieldsInPOST(postBytes,boundary,debug: bool) -> {}:
  466. """Returns a dictionary containing the text fields of a http form POST
  467. The boundary argument comes from the http header
  468. """
  469. msg = email.parser.BytesParser().parsebytes(postBytes)
  470. if debug:
  471. print('DEBUG: POST arriving '+msg.get_payload(decode=True).decode('utf-8'))
  472. messageFields=msg.get_payload(decode=True).decode('utf-8').split(boundary)
  473. fields={}
  474. # examine each section of the POST, separated by the boundary
  475. for f in messageFields:
  476. if f=='--':
  477. continue
  478. if ' name="' not in f:
  479. continue
  480. postStr=f.split(' name="',1)[1]
  481. if '"' not in postStr:
  482. continue
  483. postKey=postStr.split('"',1)[0]
  484. postValueStr=postStr.split('"',1)[1]
  485. if ';' in postValueStr:
  486. continue
  487. if '\r\n' not in postValueStr:
  488. continue
  489. postLines=postValueStr.split('\r\n')
  490. postValue=''
  491. if len(postLines)>2:
  492. for line in range(2,len(postLines)-1):
  493. if line>2:
  494. postValue+='\n'
  495. postValue+=postLines[line]
  496. fields[postKey]=postValue
  497. return fields