# Source code for pyconversations.reader.facebook

import json
import os
from glob import glob

from tqdm import tqdm

from ..convo import Conversation
from ..message import FBPost
from .base import BaseReader


class RawFBReader(BaseReader):

    """
    Reader for raw FB data

    The data is expected to be laid out as one directory per page, one
    sub-directory per post, and a set of JSON files inside each post directory
    (i.e. `path_pattern + 'PAGE/POST/*.json'`).
    """

    @staticmethod
    def read(path_pattern):
        """
        Function for reading an entire file/directory of conversations.

        Not implemented for raw FB data; use `iter_read` instead.

        Parameters
        ----------
        path_pattern : str
            The path to file or directory containing Conversation data

        Raises
        ------
        NotImplementedError
        """
        raise NotImplementedError

    @staticmethod
    def _load_json(path):
        """
        Parse a single JSON file, making sure the file handle is closed.

        Parameters
        ----------
        path : str
            The path to the JSON file

        Returns
        -------
        object
            The decoded JSON document

        Raises
        ------
        json.JSONDecodeError
            If the file does not contain valid JSON
        """
        # Binary mode lets `json` detect the encoding (UTF-8/16/32) itself
        # instead of depending on the locale's default text encoding.
        with open(path, 'rb') as fh:
            return json.load(fh)

    @staticmethod
    def iter_read(path_pattern, ld=True):
        """
        Given a `path_pattern` that points to a directory containing raw FB data
        in the form of `path_pattern/PAGES/RAW_DATA.json`,
        this function will iteratively read the files and produce Conversational data.

        Files that cannot be decoded as JSON are skipped.

        Parameters
        ----------
        path_pattern : str
            The path to file or directory containing Conversation data.
            Page names are appended to this string directly, so a directory
            path should end with a `/`.
        ld : bool
            Whether or not language detection should be activated. (Default: True)

        Yields
        ------
        2-tuple(str, Conversation)
            The name of the page (as parsed) and the result of calling `segment()`
            on the Conversation holding every post collected for that page.
            Pages are produced in sorted name order.

        Raises
        ------
        ValueError
            If a JSON file is encountered that isn't named as one of: `post`, `comments`, `replies`, `attach`, `react`, `scrape`
        """
        # File kinds that are parsed as lists of replies to the post, and
        # file kinds that are recognized but deliberately ignored.
        reply_kinds = ('comments', 'replies')
        ignored_kinds = ('attach', 'react', 'scrape')

        # gather all page names (the directory component just before the trailing '/')
        pagenames = {d.split('/')[-2] for d in glob(path_pattern + '*/')}

        # sorted so that the order of yielded pages is reproducible between runs
        for pagename in sorted(pagenames):
            page = Conversation()
            for post_path in tqdm(glob(path_pattern + f'{pagename}/*')):
                # fall back on the directory name until a parsed post supplies its uid
                pid = post_path.split('/')[-1]

                post_del = True
                for f in glob(f'{post_path}/post*.json'):
                    try:
                        post = FBPost.parse_raw(
                            RawFBReader._load_json(f),
                            post_type='post',
                            in_reply_to=pagename,
                            lang_detect=ld,
                        )
                    except json.JSONDecodeError:
                        # File is corrupt, skip
                        continue

                    if not post:
                        continue

                    pid = post.uid
                    page.add_post(post)
                    post_del = False

                # if a post is deleted, let's just create a top-level mock post
                # to keep comments centrally grouped...
                if post_del:
                    post = FBPost(uid=pid, text='[deleted]', author=pagename, platform='Facebook', lang='en')
                    page.add_post(post)

                for f in glob(f'{post_path}/*.json'):
                    # Classify on the file name only: matching against the whole
                    # path would misroute files whenever a page or directory
                    # name happens to contain one of the keywords.
                    fname = os.path.basename(f)

                    if 'post' in fname:
                        # already handled above
                        continue

                    reply_kind = next((k for k in reply_kinds if k in fname), None)
                    if reply_kind is not None:
                        # The whole loop stays inside the `try` in case
                        # `parse_raw` produces its items lazily.
                        try:
                            for x in FBPost.parse_raw(
                                RawFBReader._load_json(f),
                                post_type=reply_kind,
                                in_reply_to=pid,
                                lang_detect=ld,
                            ):
                                page.add_post(x)
                        except json.JSONDecodeError:
                            # File is corrupt, skip
                            continue
                    elif any(k in fname for k in ignored_kinds):
                        continue
                    else:
                        raise ValueError(f'RawFB::iter_read - Unrecognized file: {f}')

            yield pagename, page.segment()