In [ ]:
import spacy
import os
import nltk
from nltk.corpus import wordnet as wn


# One-time setup: spacy.load() below needs the "en_core_web_lg" model to be
# installed in this environment already. If it is missing, run the download
# line once on its own and then re-run this cell.
# NOTE(review): the download call is presumably an install step, not a loader,
# so its result should not be assigned to `nlp` -- confirm against the spaCy docs.
#spacy.cli.download("en_core_web_lg")
# Every later cell that calls nlp(...) uses the object created here.
nlp = spacy.load("en_core_web_lg")
In [ ]:
# Smoke test: walk the transcript folder and, for every .txt file, print its
# name (without the extension) followed by its length in characters. This only
# confirms that the files can be found and read.
collPath = '../transcriptFiles'
for file in os.listdir(collPath):
    if not file.endswith(".txt"):
        continue  # ignore anything that is not a plain-text transcript
    filepath = f"{collPath}/{file}"
    name, extension = os.path.splitext(file)
    print(name)
    with open(filepath, 'r', encoding='utf8') as f:
        readFile = f.read()
    # The whole text is in memory now, so it can be measured after the file is closed.
    lengthFile = len(readFile)
    print(lengthFile)
In [ ]:
# SURVEY IT ALL! :-) Run spaCy over every transcript and print each token with
# its part-of-speech tag and lemma. (Yeah, this prints a lot of data...)
for file in os.listdir(collPath):
    if not file.endswith(".txt"):
        continue  # ignore anything that is not a plain-text transcript
    filepath = f"{collPath}/{file}"
    name, extension = os.path.splitext(file)
    print(name)
    with open(filepath, 'r', encoding='utf8') as f:
        readFile = f.read()
    # Parse once the text has been read; the file handle is no longer needed.
    spacyRead = nlp(readFile)
    for token in spacyRead:
        print(token.text, "---->", token.pos_, ":::::", token.lemma_)
In [ ]:
#Create tables from txt files basic layout

import pandas as pd

def wordCollector(words, unit, pos="VERB"):
    """Tabulate the lemmas of one part of speech found in a parsed text.

    Parameters
    ----------
    words : iterable
        Yields token objects exposing ``pos_`` and ``lemma_`` attributes
        (the notebook passes the result of ``nlp(text)``).
    unit : str
        Label stored on every row; the notebook passes the source file name
        without its extension.
    pos : str, optional
        Part-of-speech tag to keep. Defaults to ``"VERB"``; pass e.g.
        ``"ADJ"``, ``"NOUN"`` or ``"PROPN"`` instead of editing this function.

    Returns
    -------
    pandas.DataFrame
        One row per distinct lemma, sorted by the grouping keys, with columns
        ``word``, ``nodeType``, ``synsetCount``, ``unit`` and ``count``.
        ``synsetCount`` is ``len(wn.synsets(lemma))`` and ``count`` is the
        number of matching tokens with that lemma (usable as edge thickness in
        a network visualization). The frame is empty, with the same columns,
        when no token matches.
    """
    keyColumns = ['word', 'nodeType', 'synsetCount', 'unit']

    # Look each lemma up in WordNet only once, however often it occurs.
    synsetCache = {}
    rows = []
    for token in words:
        if token.pos_ != pos:
            continue
        lemma = token.lemma_
        if lemma not in synsetCache:
            synsetCache[lemma] = len(wn.synsets(lemma))
        rows.append((lemma, token.pos_, synsetCache[lemma], unit))

    if not rows:
        # Return the full schema with stable dtypes so the caller's pd.concat()
        # and TSV export see the same columns as for a non-empty file.
        return pd.DataFrame({
            'word': pd.Series(dtype='object'),
            'nodeType': pd.Series(dtype='object'),
            'synsetCount': pd.Series(dtype='int64'),
            'unit': pd.Series(dtype='object'),
            'count': pd.Series(dtype='int64'),
        })

    df = pd.DataFrame(rows, columns=keyColumns)
    # ebb: one line per distinct word -- .size() delivers the number of times
    # the word is used in the file, and the rename turns that into 'count'.
    df = (df.groupby(keyColumns, as_index=False)
            .size()
            .rename(columns={'size': 'count'}))
    return df

# We need to consolidate all the dataframes into one file. Collect all dataframes here!
allDataFrames = []

# os.listdir() returns names in arbitrary order; sorting keeps the row order of
# the saved TSV identical from run to run.
for file in sorted(os.listdir(collPath)):
    # Change the startswith() prefix to select other seasons.
    if file.endswith(".txt") and file.startswith("01x"):
        filepath = os.path.join(collPath, file)
        name, extension = os.path.splitext(file)
        with open(filepath, 'r', encoding='utf8') as f:
            readFile = f.read()
        spacyRead = nlp(readFile)
        myDataFrame = wordCollector(spacyRead, name)
        # Add each individual dataframe as it comes out into the list of dataframes!
        allDataFrames.append(myDataFrame)

# pd.concat() fails with an unhelpful "No objects to concatenate" when the list
# is empty, so say what actually went wrong instead.
if not allDataFrames:
    raise ValueError(
        f"No transcript files matching '01x*.txt' were found in {collPath!r}; nothing to save."
    )

# Make an output filepath
outputFilePath = 'spyShortenedSeason1VerbData.tsv'
# Turn the list of dataframes into one dataframe:
fullDataFrame = pd.concat(allDataFrames, ignore_index=True)

# Note, since Pandas knows how to open and write files line by line, we can skip that open() step we used last time.
fullDataFrame.to_csv(outputFilePath, sep='\t', index=False)
print('I just saved a dataframe as a TSV file.')
# Go check your filestash for the file.

Episode scraper

In [ ]:
import bs4
import requests
import re  # this lets us do regex replacements
import os
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup

archive_url = "https://transcripts.foreverdreaming.org/viewforum.php?f=1524"   
output_folder = "../transcriptFiles"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}
# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# }

response = requests.get(archive_url, headers=headers)
print(response.status_code)  # Returns 200 if successful

print(response.text)

async with async_playwright() as p:
    browser = await p.chromium.launch(headless=True)
    page = await browser.new_page()
    
    # Replace with your actual URL
    await page.goto(archive_url)
    
    # Wait for the specific selector to ensure the page is loaded
    await page.wait_for_selector("a.topictitle", timeout=15000)
    
    html = await page.content()
    await browser.close()

# Now you can parse with BeautifulSoup as usual
soup = BeautifulSoup(html, "html.parser")
links = soup.find_all("a", class_="topictitle")

for link in links:
    print(link.get_text(), link.get('href'))

# Scraping Episodes
async def get_transcript_text(browser, url):
    """Open ``url`` in a new tab and return the text of its first ``div.content``.

    Parameters
    ----------
    browser
        An already-launched Playwright browser; only ``new_page()`` is used.
        The browser itself is left open for the caller to reuse.
    url : str
        Address of the transcript page.

    Returns
    -------
    str
        Text of the first ``<div class="content">`` with its text nodes joined
        by newlines, or ``""`` when the page has no such element.

    Raises
    ------
    Exception
        Whatever ``goto`` or ``wait_for_selector`` raises (e.g. a timeout) is
        propagated after the tab has been closed.
    """
    page = await browser.new_page()
    try:
        await page.goto(url)

        # Wait for the main post body to load (Forever Dreaming uses .postbody)
        await page.wait_for_selector(".postbody", timeout=10000)

        html = await page.content()
    finally:
        # Always close the tab (but keep the browser open). Without this, every
        # failed episode would leave a tab behind, because the caller catches
        # the error and moves on to the next one.
        await page.close()

    soup = BeautifulSoup(html, "html.parser")
    # Forever Dreaming transcript text is inside div.content
    content = soup.find("div", class_="content")
    return content.get_text(separator="\n") if content else ""

# Launch the Scraper
async def corpus_build(archive_url, output_folder):
    """Scrape every transcript linked from the index page into text files.

    Loads ``archive_url`` in a headless Chromium browser, collects all
    ``a.topictitle`` links on that page, and writes the text returned by
    ``get_transcript_text`` for each link to ``<output_folder>/<title>.txt``.
    A failure on one transcript is reported and the crawl moves on.

    Parameters
    ----------
    archive_url : str
        Address of the forum index page listing the transcripts.
    output_folder : str
        Directory for the ``.txt`` files; created if it does not exist.

    NOTE(review): only the links present on this one index page are followed,
    and two titles that sanitize to the same file name overwrite each other --
    confirm whether the forum listing needs pagination or de-duplication.
    """
    # Local import: only this function needs it.
    from urllib.parse import urljoin

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            try:
                print(f"Fetching index page: {archive_url}")
                await page.goto(archive_url)
                await page.wait_for_selector("a.topictitle")
                html = await page.content()
            finally:
                # The index tab is not needed once its HTML has been captured.
                await page.close()

            soup = BeautifulSoup(html, "html.parser")

            # Find all transcript links
            links = soup.find_all("a", class_="topictitle")

            for index, link in enumerate(links):
                # Replace `/` first: it could otherwise act as a path separator.
                raw_title = link.get_text().strip().replace("/", "-")
                # Anything that is not a word character (letter, digit,
                # underscore) or a hyphen becomes an underscore.
                title = re.sub(r'[^\w\-]', '_', raw_title)

                href = link.get('href')
                if not href:
                    # A link without a target must not abort the whole crawl.
                    print(f"Skipping {title}: link has no href")
                    continue
                # urljoin resolves "./viewtopic.php?...", "/viewtopic.php?..."
                # and absolute URLs alike against the index page address.
                full_url = urljoin(archive_url, href)

                print(f"Scraping ({index+1}/{len(links)}): {title}")

                try:
                    text = await get_transcript_text(browser, full_url)

                    # Write to file
                    file_path = os.path.join(output_folder, f"{title}.txt")
                    with open(file_path, "w", encoding="utf-8") as f:
                        f.write(text)
                except Exception as e:
                    # Best effort: report the failure and keep going.
                    print(f"Failed to scrape {title}: {e}")
        finally:
            # Close the browser even if the index page could not be loaded.
            await browser.close()

    print("Finished! Corpus saved.")

# Initiate the process by running corpus_build()

await corpus_build(
    archive_url, 
    output_folder
)