Source code for pyconversations.reader.reddit

import json
from datetime import datetime
from glob import glob

from tqdm import tqdm

from ..convo import Conversation
from ..message import RedditPost
from .base import BaseReader


[docs]class RedditReader(BaseReader):

    """
    General Reddit raw data reader.
    """

[docs]    @staticmethod
    def read(path_pattern):
        """
        Function for reading an entire file/directory of conversations.

        Parameters
        ----------
        path_pattern : str
            The path to file or directory containing Conversation data

        Raises
        ------
        NotImplementedError
        """
        raise NotImplementedError

[docs]    @staticmethod
    def iter_read(path_pattern, ld=True, rd=False):
        """
        This iterative reading function assumes that the path it will be pointed towards
        contains raw Reddit comments and submissions, sorted/chunked by the month they were created.

        Parameters
        ----------
        path_pattern : str
            The path to the directory containing the data
        ld : bool
            Whether or not activate language detection (Default: True)
        rd : bool
            Whether to use the secondary Reddit parser (`RedditPost.parse_rd`) or not (`RedditPost.parse_raw`) (Default: False)

        Yields
        ------
        list(Conversation)
            A chunk of Conversations, as parsed
        """
        convo = Conversation()
        for f in tqdm(sorted(glob(f'{path_pattern}*.json'))):
            if rd:
                with open(f) as fp:
                    for line in fp.readlines():
                        try:
                            data = json.loads(line)
                            convo.add_post(RedditPost.parse_rd(data, lang_detect=ld))
                        except json.decoder.JSONDecodeError:
                            if '}{' in line:
                                lxs = line.split('}{')
                                lx0, lxs = lxs[0], lxs[1:]
                                lx0 += '}'
                                lxs = [lx0] + ['{' + lx for lx in lxs]

                                for lx in lxs:
                                    convo.add_post(RedditPost.parse_rd(json.loads(lx), lang_detect=ld))
                            else:
                                print(line)
                                import pdb
                                pdb.set_trace()

                date_str = f.split('/')[-1][:7]
                dt = datetime.strptime(date_str, '%Y-%m')
                if dt.month in {1, 4, 7, 10}:
                    # dump all  posts older than 6 months
                    out = Conversation()
                    to_drop = set()
                    for uid, post in convo.posts.items():
                        out.add_post(post)
                        to_drop.add(uid)

                    for uid in to_drop:
                        convo.remove_post(uid)

                    out = out.segment()
                    yield out
            else:
                convo = Conversation()
                with open(f) as fp:
                    for line in fp.readlines():
                        convo.add_post(RedditPost.parse_raw(json.loads(line), lang_detect=ld))

                segs = convo.segment()
                yield segs

        if rd and convo.messages:
            segs = convo.segment()
            yield segs


[docs]class BNCReader(BaseReader):

    """
    A custom Reddit Reader generated for the data format
    from "Before Name-calling: Dynamics and Triggers of Ad Hominem Fallacies in Web Argumentation" (Habernal et al., 2018).

    Notes
    -----
    See: https://www.aclweb.org/anthology/N18-1036/
    """

[docs]    @staticmethod
    def read(path_pattern, ld=True):
        """
        Reads the entire archive of posts from this dataset.
        Posts that violate rule 2 of the r/ChangeMyView sub-reddit are tagged with the `AH=1` tag;
        otherwise, posts are tagged with `AH=0`.

        Parameters
        ----------
        path_pattern : str
            The path to the directory containing the data
        ld : bool
            Whether or not activate language detection (Default: True)

        Returns
        -------
        list(Conversation)
            A list of all parsed and segmented disjoint Conversations within this dataset
        """
        convo = Conversation()
        for f in tqdm(glob(path_pattern)):
            with open(f) as fp:
                for line in fp.readlines():
                    raw = json.loads(line)
                    post = RedditPost.parse_raw(raw, lang_detect=ld)
                    post.add_tag('AH=1' if raw["violated_rule"] == 2 else 'AH=0')
                    convo.add_post(post)

        segs = convo.segment()
        return segs

[docs]    @staticmethod
    def iter_read(path_pattern, ld=True, rd=False):
        """
        Function for creating a conversation reading iterator.
        Will read and parse part of a file/directory, yielding segments as queried.

        Parameters
        ----------
        path_pattern : str
            The path to file or directory containing Conversation data
        ld : bool
            Whether or not activate language detection (Default: True)
        rd : bool
            Whether to use the secondary Reddit parser (`RedditPost.parse_rd`) or not (`RedditPost.parse_raw`) (Default: False)

        Raises
        ------
        NotImplementedError
        """
        raise NotImplementedError