Source code for pyconversations.message.twitter
import re
from datetime import datetime
from .base import UniMessage
class Tweet(UniMessage):
    """
    Twitter post object with additional Twitter-specific features
    """

    MENTION_REGEX = r'(^|[^@\w])@(\w{1,15})\b'
    CLASS_STR = 'Tweet'

    # Top-level keys of a raw tweet dictionary that `parse_raw` skips on purpose.
    _IGNORED_KEYS = frozenset({
        'id_str', 'truncated', 'display_text_range', 'entities', 'source',
        'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str',
        'in_reply_to_screen_name', 'geo', 'coordinates', 'place', 'contributors',
        'is_quote_status', 'retweet_count', 'favorite_count', 'favorited',
        'retweeted', 'metadata', 'extended_entities', 'possibly_sensitive',
        'quoted_status_id_str', 'quoted_status_permalink', 'withheld_in_countries',
        'in_reply_to_status_created_at', 'possibly_sensitive_appealable', 'scopes',
        'withheld_scope', 'withheld_copyright',
        'filter_level', 'trends',
    })

    # Keys of the nested `entities` dictionary that `parse_raw` skips on purpose.
    _IGNORED_ENTITY_KEYS = frozenset({
        'hashtags', 'symbols', 'user_mentions', 'trends',
    })

    def __init__(self, **kwargs):
        """
        Builds the message, forcing the `platform` keyword to 'Twitter'.

        Parameters
        ----------
        kwargs
            Keyword arguments forwarded unchanged to the parent constructor
            (any caller-supplied `platform` value is overwritten).
        """
        kwargs['platform'] = 'Twitter'
        super().__init__(**kwargs)

    @staticmethod
    def parse_datestr(x):
        """
        Static method that specifies how to convert the native datetime string
        into a Python datetime object.

        Parameters
        ----------
        x
            The raw datetime string, e.g. 'Wed Oct 10 20:19:24 +0000 2018'

        Returns
        -------
        datetime
            A naive datetime (no tzinfo attached). The format requires the
            literal offset '+0000'; any other string raises ValueError.
        """
        return datetime.strptime(x, '%a %b %d %H:%M:%S +0000 %Y')

    def get_mentions(self):
        """
        Uses Twitter specific regex to attempt to identify
        user mentions within the comment text.

        Returns a set of usernames: the union of the parent class' mentions
        and the names matched by `MENTION_REGEX`.
        """
        # The pattern has two groups, so findall yields (prefix, username) tuples;
        # only the username is kept.
        names = {match[1] for match in re.findall(self.MENTION_REGEX, self.text)}
        return super().get_mentions() | names

    @staticmethod
    def parse_raw(data, lang_detect=False):
        """
        Static method that must be implemented by all non-abstract child classes.
        Concrete implementations should specify how to parse the raw data into this object.

        Returns a list of Twitter posts: any posts parsed from a nested
        `quoted_status` come first, followed by the post described by `data`
        itself (only if it ends up with non-empty text).

        Parameters
        ----------
        data
            The raw data to be pre-processed (a dictionary describing one tweet).
        lang_detect
            A boolean which specifies whether language detection should be activated. (Default: False)
            The flag is also applied to nested quoted tweets.

        Raises
        ------
        KeyError
            If `data` (or its `entities` dictionary) contains a key that is
            neither handled nor explicitly ignored.
        """
        cons_vals = {
            'reply_to': set(),
            'lang_detect': lang_detect,
        }
        out = []

        for key, value in data.items():
            if key in Tweet._IGNORED_KEYS:
                continue

            if key == 'created_at':
                cons_vals['created_at'] = Tweet.parse_datestr(value)
            elif key == 'id':
                cons_vals['uid'] = value
            elif key == 'full_text':
                # The untruncated text always wins over 'text'.
                cons_vals['text'] = value
            elif key == 'text':
                # Only a fallback: must not clobber 'full_text', whichever of
                # the two keys happens to be iterated first.
                cons_vals.setdefault('text', value)
            elif key == 'lang':
                cons_vals['lang'] = value
            elif key in ('in_reply_to_status_id', 'quoted_status_id'):
                # NOTE(review): the value is added even when it is None
                # (presumably the case for tweets that are not replies/quotes);
                # confirm that the parent class tolerates or filters it.
                cons_vals['reply_to'].add(value)
            elif key == 'user':
                cons_vals['author'] = value['screen_name']
            elif key == 'quoted_status':
                out.extend(Tweet.parse_raw(value, lang_detect=lang_detect))
            else:
                raise KeyError(f'Tweet:parse_raw - Unrecognized key: {key} --> {value}')

        # Do entities last, so the text they rewrite has already been collected
        if 'entities' in data:
            for key, value in data['entities'].items():
                if key in Tweet._IGNORED_ENTITY_KEYS:
                    continue

                if key == 'media':
                    target = 'display_url'
                elif key == 'urls':
                    target = 'expanded_url'
                else:
                    raise KeyError(f'Tweet:parse_raw (entities) - Unrecognized key: {key} --> {value}')

                # Literal replacement: URLs contain regex metacharacters
                # ('.', '?', '+', ...) and must not be used as patterns.
                for v in value:
                    cons_vals['text'] = cons_vals.get('text', '').replace(v['url'], v[target])

        if 'text' in cons_vals and cons_vals['text']:
            out.append(Tweet(**cons_vals))

        return out