import re
from datetime import datetime
from .base import UniMessage
class RedditPost(UniMessage):
    """
    Reddit post object with additional Reddit-specific features.

    Every instance is created with ``platform='Reddit'``. Raw Reddit JSON can
    be converted into instances with :meth:`parse_raw` (API-style payloads)
    or :meth:`parse_rd` (flat records carrying a ``type`` field).
    """

    # Matches "u/name" or "/u/name" at the start of the text or after a
    # non-word character; the second capture group is the username.
    MENTION_REGEX = r'(^|[^\w])/?u/([A-Za-z0-9_-]+)\b'
    CLASS_STR = 'RedditPost'

    # Raw keys that parse_raw silently skips. The commented-out entries are
    # NOT ignored: they are the keys parse_raw handles explicitly, kept here
    # so the list stays a complete alphabetical index of known keys.
    # Built once at class creation instead of on every (recursive) call.
    _IGNORED_KEYS = frozenset({
        'approved_at_utc',
        'approved_by',
        'archived',
        # 'author',
        'author_cakeday',
        'author_flair_background_color',
        'author_flair_css_class',
        'author_flair_richtext',
        'author_flair_template_id',
        'author_flair_text',
        'author_flair_text_color',
        'author_flair_type',
        # 'author_name',
        'banned_at_utc',
        'banned_by',
        # 'body',
        'body_html',
        'can_gild',
        'can_mod_post',
        'category',
        # 'children',
        'clicked',
        'collapsed',
        'collapsed_reason',
        'content_categories',
        'contest_mode',
        'controversiality',
        'count',
        # 'created',
        # 'created_utc',
        'delta',
        'depth',
        'distinguished',
        'domain',
        'downs',
        'edited',
        'gilded',
        'hidden',
        'hide_score',
        # 'id',
        'is_crosspostable',
        'is_meta',
        'is_original_content',
        'is_reddit_media_domain',
        'is_self',
        'is_submitter',
        'is_video',
        'kind',
        'likes',
        'link_flair_background_color',
        'link_flair_css_class',
        'link_flair_richtext',
        'link_flair_template_id',
        'link_flair_text',
        'link_flair_text_color',
        'link_flair_type',
        'link_id',
        'locked',
        'media',
        'media_embed',
        'media_only',
        'mod_note',
        'mod_reason_by',
        'mod_reason_title',
        'mod_reports',
        # 'name',
        'no_follow',
        'num_comments',
        'num_crossposts',
        'num_reports',
        'over_18',
        # 'parent_id',
        'parent_whitelist_status',
        'permalink',
        'pinned',
        'post_categories',
        'post_hint',
        'preview',
        'previous_visits',
        'pwls',
        'quarantine',
        'removal_reason',
        # 'replies',
        'report_reasons',
        'rte_mode',
        'saved',
        'score',
        'score_hidden',
        'secure_media',
        'secure_media_embed',
        # 'selftext',
        'selftext_html',
        'send_replies',
        'spoiler',
        'stickied',
        'subreddit',
        'subreddit_id',
        'subreddit_name_prefixed',
        'subreddit_subscribers',
        'subreddit_type',
        'suggested_sort',
        'thumbnail',
        'thumbnail_height',
        'thumbnail_width',
        # 'title',
        'ups',
        'upvote_ratio',
        'url',
        'user_reports',
        'view_count',
        'violated_rule',
        'visited',
        'whitelist_status',
        'wls',
    })

    def __init__(self, **kwargs):
        """
        Build a post, forcing ``platform`` to ``'Reddit'``.

        Parameters
        ----------
        **kwargs
            Keyword arguments forwarded unchanged to the parent constructor.
            Any caller-supplied ``platform`` value is overwritten.
        """
        kwargs['platform'] = 'Reddit'
        super().__init__(**kwargs)

    @staticmethod
    def parse_datestr(x):
        """
        Static method that specifies how to convert the native datetime string
        into a Python datetime object.

        The value is interpreted as seconds since the Unix epoch. Because no
        ``tz`` is passed to ``datetime.fromtimestamp``, the result is a naive
        datetime expressed in the local timezone of the running machine.

        Parameters
        ----------
        x : str
            The raw datetime string (anything ``float()`` accepts).

        Returns
        -------
        datetime.datetime
            The parsed datetime
        """
        return datetime.fromtimestamp(float(x))

    def get_mentions(self):
        """
        Uses Reddit specific regex to attempt to identify
        user mentions within the comment text.

        The Reddit-style matches found in ``self.text`` are merged with
        whatever the parent class's ``get_mentions`` returns.

        Returns
        -------
        set(str)
            The set of extracted usernames
        """
        # findall yields one (prefix, username) tuple per match because the
        # pattern has two capture groups; only the username is kept.
        reddit_names = {
            match[1] for match in re.findall(self.MENTION_REGEX, self.text)
        }
        return super().get_mentions() | reddit_names

    @staticmethod
    def parse_raw(data, lang_detect=False):
        """
        Static method that must be implemented by all non-abstract child classes.
        Concrete implementations should specify how to parse the raw data into this object.

        Three input shapes are accepted:

        * a list -- every ``dict`` element is parsed recursively, non-dict
          elements are skipped, and the results are flattened into one list;
        * an envelope ``dict`` holding both ``'kind'`` and ``'data'`` -- for
          kind ``'Listing'`` each entry of ``data['data']['children']`` is
          parsed via its own ``'data'`` member, otherwise ``data['data']``
          is parsed directly;
        * a plain record ``dict`` -- converted key by key into one post;
          posts found under ``'replies'`` / ``'children'`` are parsed
          recursively and placed before the record's own post.

        For a plain record, ``'created'`` takes precedence over
        ``'created_utc'`` and ``'id'`` over ``'name'`` when both are present.
        A record whose uid is ``'_'`` is not emitted.

        Parameters
        ----------
        data : JSON/dict
            The raw data to be pre-processed.
        lang_detect : bool
            A boolean which specifies whether language detection should be activated. (Default: False)

        Returns
        -------
        list(RedditPost)
            The parsed posts, as a flat list.

        Raises
        ------
        KeyError
            If a record contains a key that is neither handled nor ignored,
            or if a record provides neither ``'id'`` nor ``'name'``.
        """
        # Lists are handled first so the "'kind' in data" envelope test below
        # is never evaluated as an element-membership test on a list.
        if isinstance(data, list):
            # Flatten: each recursive call already returns a list of posts,
            # so collecting the calls themselves would yield a list of lists.
            return [
                post
                for item in data
                if isinstance(item, dict)
                for post in RedditPost.parse_raw(item, lang_detect=lang_detect)
            ]

        if 'kind' in data and 'data' in data:
            if data['kind'] == 'Listing':
                return [
                    post
                    for child in data['data']['children']
                    for post in RedditPost.parse_raw(
                        child['data'], lang_detect=lang_detect)
                ]
            return RedditPost.parse_raw(data['data'], lang_detect=lang_detect)

        post_cons = {
            'reply_to': set(),
            'lang_detect': lang_detect
        }
        out = []
        for key, value in data.items():
            if key in RedditPost._IGNORED_KEYS:
                continue
            if key == 'author_name' or key == 'author':
                post_cons['author'] = value
            elif key == 'body' or key == 'selftext':
                # Body text goes after anything collected so far (the title).
                post_cons['text'] = (
                    post_cons['text'] + '\n' + value
                    if 'text' in post_cons else value
                )
            elif key == 'title':
                # The title always goes in front, whatever the key order.
                post_cons['text'] = (
                    value + '\n' + post_cons['text']
                    if 'text' in post_cons else value
                )
            elif key == 'created':
                post_cons['created_at'] = RedditPost.parse_datestr(value)
            elif key == 'created_utc':
                # 'created' wins when both timestamps are present.
                if 'created' in data:
                    continue
                post_cons['created_at'] = RedditPost.parse_datestr(value)
            elif key == 'id':
                post_cons['uid'] = value
            elif key == 'name':
                # 'name' is only a fallback identifier when 'id' is absent.
                if 'id' not in data:
                    post_cons['uid'] = value
            elif key == 'parent_id':
                # Strip type prefixes such as "t1_" / "t3_" so the reference
                # matches the bare ids used as uid.
                post_cons['reply_to'].add(re.sub(r't\d_', '', value))
            elif key == 'replies' or key == 'children':
                if value:
                    out.extend(
                        RedditPost.parse_raw(value, lang_detect=lang_detect))
            else:
                raise KeyError(f'RedditPost::parse_raw - Unrecognized key: {key} --> {value}')

        # NOTE(review): '_' is presumably the id Reddit gives placeholder
        # stubs (e.g. "continue this thread") -- confirm against real data.
        if post_cons['uid'] != '_':
            out.append(RedditPost(**post_cons))
        return out

    @staticmethod
    def parse_rd(data, lang_detect=True):
        """
        Secondary method for parsing raw Reddit data

        Expects a flat record with ``'id'``, ``'author'``, ``'created_utc'``,
        ``'subreddit'`` and ``'type'``. Comments (``type == 'comment'``) also
        need ``'body'`` and ``'parent_id'``; submissions
        (``type == 'submission'``) need ``'title'`` and ``'selftext'``.
        The subreddit is stored as the tag ``board=<subreddit>``.

        Parameters
        ----------
        data : JSON/dict
            The raw data to be pre-processed.
        lang_detect : bool
            A boolean which specifies whether language detection should be activated. (Default: True)

        Returns
        -------
        RedditPost
            The parsed post

        Raises
        ------
        KeyError
            If one of the required keys is missing from ``data``.
        ValueError
            If ``data['type']`` is neither ``'comment'`` nor ``'submission'``.
        """
        cons = {
            'lang_detect': lang_detect,
            'uid': data['id'],
            'author': data['author'],
            'created_at': RedditPost.parse_datestr(data['created_utc']),
            'tags': {f'board={data["subreddit"]}'}
        }
        if data['type'] == 'comment':
            cons['text'] = data['body']
            pid = data['parent_id']
            # Remove the "t1_" .. "t5_" type prefixes from the parent id.
            for i in range(1, 6):
                pid = pid.replace(f't{i}_', '')
            cons['reply_to'] = {pid}
        elif data['type'] == 'submission':
            cons['text'] = data['title'] + ' ' + data['selftext']
            cons['reply_to'] = set()
        else:
            raise ValueError(f'RedditPost::parse_rd -- Unrecognized type: {data}')
        return RedditPost(**cons)