In [ ]:
import spacy
import os
import nltk
from nltk.corpus import wordnet as wn
# One-time setup: if the model is not installed yet, run
# spacy.cli.download("en_core_web_lg") once on its own (do not assign its
# result to nlp), then load the model as below.
# Load the large English spaCy pipeline; later cells call nlp(text) to parse transcripts.
nlp = spacy.load("en_core_web_lg")
In [ ]:
# Folder that holds the plain-text transcript files.
collPath = '../transcriptFiles'

# Smoke test: print each transcript's base name and character count, just to
# confirm that the files can be found and read.
for fileName in os.listdir(collPath):
    if not fileName.endswith(".txt"):
        continue
    filepath = f"{collPath}/{fileName}"
    name, extension = os.path.splitext(fileName)
    print(name)
    with open(filepath, 'r', encoding='utf8') as handle:
        readFile = handle.read()
    lengthFile = len(readFile)
    print(lengthFile)
In [ ]:
# Full survey: parse every transcript with spaCy and print each token together
# with its part-of-speech tag and lemma. (Expect a LOT of output.)
for fileName in os.listdir(collPath):
    if not fileName.endswith(".txt"):
        continue
    filepath = f"{collPath}/{fileName}"
    name, extension = os.path.splitext(fileName)
    print(name)
    with open(filepath, 'r', encoding='utf8') as handle:
        readFile = handle.read()
    spacyRead = nlp(readFile)
    for token in spacyRead:
        print(token.text, "---->", token.pos_, ":::::", token.lemma_)
In [ ]:
#Create tables from txt files basic layout
import pandas as pd
def wordCollector(words, unit, pos="VERB"):
    """Count the lemmas of the selected part(s) of speech in one parsed text.

    Args:
        words: Iterable of spaCy tokens (any objects exposing ``pos_`` and
            ``lemma_``), e.g. the Doc returned by ``nlp(text)``.
        unit: Label written into the ``unit`` column of every row; the caller
            passes the base name of the source file.
        pos: The ``pos_`` tag to keep, or an iterable of tags such as
            ``("NOUN", "PROPN")``. Defaults to ``"VERB"``.

    Returns:
        A pandas DataFrame with one row per distinct (lemma, tag) pair, sorted
        by word and then tag, with the columns ``word``, ``nodeType``,
        ``synsetCount``, ``unit`` and ``count``. ``count`` is the number of
        times the lemma occurs with that tag in ``words`` (usable as an edge
        weight in a network visualization). The frame is empty, but keeps the
        same columns, when no token matches.
    """
    from collections import Counter

    wantedTags = {pos} if isinstance(pos, str) else set(pos)

    # Tally occurrences first so WordNet is queried once per distinct lemma
    # instead of once per token.
    occurrences = Counter(
        (token.lemma_, token.pos_)
        for token in words
        if token.pos_ in wantedTags
    )

    # NOTE: wn.synsets() is called without a pos argument, so synsetCount
    # counts the lemma's synsets across every WordNet part of speech.
    rows = [
        (lemma, tag, len(wn.synsets(lemma)), unit, count)
        for (lemma, tag), count in sorted(occurrences.items())
    ]
    return pd.DataFrame(
        rows,
        columns=['word', 'nodeType', 'synsetCount', 'unit', 'count'],
    )
# wordCollector() returns a separate dataframe for every source text file.
# Collect them all here so they can be consolidated into one TSV.
allDataFrames = []
# os.listdir() returns names in arbitrary order; sorting keeps the row order
# of the output file reproducible from run to run.
for file in sorted(os.listdir(collPath)):
    # Change or extend the startswith() prefix to select other seasons.
    if file.endswith(".txt") and file.startswith("01x"):
        filepath = f"{collPath}/{file}"
        name, extension = os.path.splitext(file)
        with open(filepath, 'r', encoding='utf8') as f:
            readFile = f.read()
        spacyRead = nlp(readFile)
        myDataFrame = wordCollector(spacyRead, name)
        # Add each individual dataframe to the list as it comes out.
        allDataFrames.append(myDataFrame)

# Output filepath for the combined table.
outputFilePath = 'spyShortenedSeason1VerbData.tsv'
if allDataFrames:
    # Turn the list of dataframes into one dataframe.
    fullDataFrame = pd.concat(allDataFrames, ignore_index=True)
    # pandas opens and writes the file itself, so no explicit open() is needed.
    fullDataFrame.to_csv(outputFilePath, sep='\t', index=False)
    print('I just saved a dataframe as a TSV file.')
    # Go check your filestash for the file.
else:
    # pd.concat() raises ValueError on an empty list, so report the empty
    # selection instead of crashing.
    print(f"No matching .txt transcripts found in {collPath}; nothing was saved.")
Episode scraper
In [ ]:
import bs4
import requests
import re # this lets us do regex replacements
import os
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
# Forum index page that lists the transcripts, and the folder the scraped
# transcript files are written to.
archive_url = "https://transcripts.foreverdreaming.org/viewforum.php?f=1524"
output_folder = "../transcriptFiles"

# Browser-like request headers sent with the plain HTTP check below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}

# Quick check of what a plain HTTP request to the index page returns.
# requests has no default timeout, so set one to avoid hanging forever on an
# unresponsive server.
response = requests.get(archive_url, headers=headers, timeout=30)
print(response.status_code)  # 200 if the request succeeded
print(response.text)
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()
# Replace with your actual URL
await page.goto(archive_url)
# Wait for the specific selector to ensure the page is loaded
await page.wait_for_selector("a.topictitle", timeout=15000)
html = await page.content()
await browser.close()
# Now you can parse with BeautifulSoup as usual
soup = BeautifulSoup(html, "html.parser")
links = soup.find_all("a", class_="topictitle")
for link in links:
print(link.get_text(), link.get('href'))
# Scraping Episodes
async def get_transcript_text(browser, url):
    """Fetch one transcript page and return its text.

    Opens ``url`` in a new tab of ``browser``, waits up to 10000 ms for an
    element matching ``.postbody``, and extracts the text of the first
    ``div.content`` in the page.

    Args:
        browser: An already-launched Playwright browser; it is left open so
            the caller can reuse it for the next page.
        url: Absolute URL of the transcript page.

    Returns:
        The text of the first ``div.content`` with its text nodes joined by
        newlines, or ``""`` when the page has no such element.

    Raises:
        Whatever Playwright raises when navigation fails or the selector does
        not appear in time; the tab is closed before the error propagates.
    """
    page = await browser.new_page()
    try:
        await page.goto(url)
        # Wait for the main post body to load (Forever Dreaming uses .postbody).
        await page.wait_for_selector(".postbody", timeout=10000)
        html = await page.content()
    finally:
        # Close the tab even on failure: the caller catches the error and
        # moves on, so an unclosed tab would pile up for every failed episode.
        await page.close()

    soup = BeautifulSoup(html, "html.parser")
    # Forever Dreaming transcript text is inside div.content.
    content = soup.find("div", class_="content")
    return content.get_text(separator="\n") if content else ""
# Launch the Scraper
async def corpus_build(archive_url, output_folder):
    """Scrape every transcript linked from the index page into text files.

    Loads ``archive_url`` in headless Chromium, collects every
    ``a.topictitle`` link, fetches each linked page with
    ``get_transcript_text`` and writes the result to
    ``<output_folder>/<sanitized title>.txt`` (UTF-8).

    Args:
        archive_url: URL of the forum index page that lists the transcripts.
            Relative topic links are resolved against it.
        output_folder: Directory for the ``.txt`` files; created if missing.

    A failure on a single episode is printed and skipped so that one bad page
    does not abort the whole run. Failures while loading the index page
    propagate to the caller.
    """
    from urllib.parse import urljoin

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            print(f"Fetching index page: {archive_url}")
            await page.goto(archive_url)
            await page.wait_for_selector("a.topictitle")
            html = await page.content()
            # The index tab is no longer needed once its HTML is captured.
            await page.close()

            soup = BeautifulSoup(html, "html.parser")
            # Find all transcript links; anchors without an href cannot be
            # followed, so they are dropped up front.
            links = [
                link
                for link in soup.find_all("a", class_="topictitle")
                if link.get('href')
            ]

            for index, link in enumerate(links):
                # Replace `/` first, since it could act as a path separator.
                raw_title = link.get_text().strip().replace("/", "-")
                # Keep word characters (letters, digits, underscore) and
                # hyphens; everything else becomes an underscore.
                title = re.sub(r'[^\w\-]', '_', raw_title)

                # Resolve the href ("./viewtopic.php?...", "/...", or an
                # absolute URL) against the index page URL.
                full_url = urljoin(archive_url, link.get('href'))

                print(f"Scraping ({index+1}/{len(links)}): {title}")

                # Deliberately broad: report the failure and carry on with
                # the next episode.
                try:
                    text = await get_transcript_text(browser, full_url)
                    # Write to file
                    file_path = os.path.join(output_folder, f"{title}.txt")
                    with open(file_path, "w", encoding="utf-8") as f:
                        f.write(text)
                except Exception as e:
                    print(f"Failed to scrape {title}: {e}")
        finally:
            await browser.close()

    print("Finished! Corpus saved.")
# Initiate the process by running corpus_build().
# Top-level `await` only works where an event loop is already running (e.g. a
# Jupyter/IPython cell); in a plain script, use
# asyncio.run(corpus_build(archive_url, output_folder)) instead.
await corpus_build(
archive_url,
output_folder
)