{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "9b77f851",
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "b0aca9b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "cbb6dae4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Instantiate the small English spaCy pipeline once; later cells reuse `nlp`\n",
    "nlp = spacy.load(name=\"en_core_web_sm\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "614fe09f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raise the pipeline's character limit so the whole text can be processed;\n",
    "# it must be greater than the length of your text.\n",
    "nlp.max_length = 2_000_000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "803cd9ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "import PyPDF2\n",
    "\n",
    "# Path of the PDF to analyse -- replace with the path to your own file\n",
    "pdf_file_path = \"yourfile.pdf\"\n",
    "\n",
    "# Open the PDF in binary read mode; the handle is closed in a later cell\n",
    "pdf_file = open(pdf_file_path, mode=\"rb\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "aa8b87ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap the open file handle in a PyPDF2 reader for page-level access\n",
    "pdf_reader = PyPDF2.PdfFileReader(stream=pdf_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "93ef84a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start with an empty accumulator for the text pulled out of the PDF\n",
    "extracted_text = str()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "19d9e19b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract the text of every page and join the pieces in page order.\n",
    "# The result is assigned (not appended), so re-running this cell cannot\n",
    "# duplicate the document inside `extracted_text`.\n",
    "# NOTE(review): numPages/getPage/extractText are PyPDF2's legacy camelCase API;\n",
    "# confirm they are supported by the installed PyPDF2 version.\n",
    "page_texts = [\n",
    "    pdf_reader.getPage(page_num).extractText()\n",
    "    for page_num in range(pdf_reader.numPages)\n",
    "]\n",
    "extracted_text = \"\".join(page_texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "6c8c8875",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 'extracted_text' now holds the text content of the entire PDF, so the\n",
    "# file handle is no longer needed and can be released\n",
    "pdf_file.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "2f2706bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Characters to track -- replace with the character names in your text\n",
    "character_names = [\n",
    "    \"Rand\",\n",
    "    \"Moiraine\",\n",
    "    \"Egwene\",\n",
    "    \"Perrin\",\n",
    "    \"Nynaeve\",\n",
    "]\n",
    "\n",
    "# Per-character running word totals; names not seen yet start at 0\n",
    "word_counts = defaultdict(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "ea59f104",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the spaCy pipeline over the text extracted from the PDF\n",
    "doc = nlp(text=extracted_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "11e26683",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tally, per tracked character, the words of every sentence that mentions them.\n",
    "# NOTE: this measures words in sentences naming the character, which is only a\n",
    "# rough proxy for words actually spoken by that character.\n",
    "\n",
    "# Start from zero so re-running this cell does not add to earlier totals\n",
    "word_counts.clear()\n",
    "\n",
    "# (character, sentence start) pairs already counted: a sentence naming the same\n",
    "# character more than once must contribute its words only once\n",
    "counted_sentences = set()\n",
    "\n",
    "for ent in doc.ents:\n",
    "    if ent.text not in character_names:\n",
    "        continue\n",
    "    character_name = ent.text\n",
    "\n",
    "    # Sentence span containing this mention of the character\n",
    "    sentence = ent.sent\n",
    "    sentence_key = (character_name, sentence.start)\n",
    "    if sentence_key in counted_sentences:\n",
    "        continue\n",
    "    counted_sentences.add(sentence_key)\n",
    "\n",
    "    # Drop the character's name, then count whitespace-separated words\n",
    "    dialogue = sentence.text.replace(character_name, \"\").strip()\n",
    "    word_counts[character_name] += len(dialogue.split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "6c7fc5f5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total words spoken by Rand: 17407\n",
      "Total words spoken by Egwene: 7512\n",
      "Total words spoken by Nynaeve: 4583\n",
      "Total words spoken by Moiraine: 9495\n",
      "Total words spoken by Perrin: 960\n"
     ]
    }
   ],
   "source": [
    "# Report the accumulated total for every character that was counted\n",
    "for character_name in word_counts:\n",
    "    count = word_counts[character_name]\n",
    "    print(f\"Total words spoken by {character_name}: {count}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3cda3e88",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}