Source code for pyconversations.message.chan

import html
import re
from datetime import datetime

from .base import UniMessage


[docs]class ChanPost(UniMessage): """ 4chan post object with additional 4chan-specific features """ CLASS_STR = '4chanPost' MENTION_REGEX = r'>>(\d+)' def __init__(self, **kwargs): kwargs['platform'] = '4chan' super(ChanPost, self).__init__(**kwargs)
[docs] @staticmethod def parse_datestr(x): """ Static method that specifies how to convert the native datetime string into a a Python datetime object. Parameters ---------- x : str The raw datetime string Returns ------- datetime.datetime Creation datetime of 4chan timestamp """ return datetime.fromtimestamp(float(x))
[docs] @staticmethod def exclude_replies(comment): """ Function to remove quotes from a reply and return reference to the posts that were replied to Parameters ---------- comment : str Text from 4chan comment Returns ------- 2-tuple(str, list(int)) The comment and the list of identified references """ refs = re.findall(r'>>(\d+)', comment) return comment, refs
[docs] @staticmethod def clean_text(comment): """ Cleans the raw HTML of a cached 4chan post, returning both the references and teh comment itself Parameters ---------- comment : str The raw text of a 4chan comment Returns ------- 2-tuple(str, list(int)) The comment text and a list of identified reference IDs """ comment = html.unescape(comment) comment = re.sub(r"<br/?>", "\n", comment) comment = re.sub(r"<wbr/?>", "", comment) comment, rfs = ChanPost.exclude_replies(comment) comment = str(comment).strip() return comment, rfs
[docs] @staticmethod def parse_raw(data, lang_detect=False): """ Static method that must be implemented by all non-abstract child classes. Concrete implementations should specify how to parse the raw data into this object. Parameters ---------- data : JSON/dict The raw data to be pre-processed. lang_detect : bool A boolean which specifies whether language detection should be activated. (Default: False) Returns ------- ChanPost The parsed post """ if 'com' not in data: return txt, rfs = ChanPost.clean_text(data['com']) reps = {int(data['resto'])} if data['resto'] else set() reps |= set([int(x) for x in rfs]) if int(data['no']) in reps: reps.remove(int(data['no'])) return ChanPost(**{ 'uid': int(data['no']), 'created_at': datetime.fromtimestamp(data['time']), 'text': txt, 'author': data['name'] if 'name' in data else None, 'reply_to': reps, 'lang_detect': lang_detect })