git.net

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

NotADirectoryError: [Errno 20] Not a directory


Note: Beginner

I'm trying to create an html parser that will go through a folder and all
its subfolders and export all html files without any html tags, in file
formats CSV and TXT with each html labeled with the title of the web page
in a new CSV and TXT.

However I keep getting an error saying:




*"Traceback (most recent call last):  File
"/Users/username/Documents/htmlparser/parser10.py", line 59, in <module>
for subentry in os.scandir(entry.path):NotADirectoryError: [Errno 20] Not a
directory: '/Users/username/site/.DS_Store'"*

Here's what I've done so far (I have bolded line 59):

"
import bs4 as bs
import csv
import glob
import os
import re

directory = "/Users/username/site"


with os.scandir(directory) as it:
    for entry in it:
        if ".html" in entry.name or
re.match(r'.*?(?:\.html?$|\.html?\?.*)', entry.name) is not None:
            print(entry.name, entry.path)
            my_data = (entry)
            listofp = []
            soup = bs.BeautifulSoup(open(my_data, "r").read())
            for paragraph in soup.find_all('p'):
                listofp.append(paragraph.string)
                title = soup.title.string
                leftitle = [title]
                listception = [leftitle]
            for moreshit in soup.find_all('h1', 'h2', 'h3', 'h4', 'h5'):
                listception.append([str(moreshit.text)])
            for paragraph in soup.find_all('p'):
                listception.append([str(paragraph.text)])
            for elements in soup.find_all('li', 'td', 'div', 'span'):
                listception.append([str(elements.text)])
            for evenmoreshit in soup.find_all('h6', 'a'):
                listception.append([str(evenmoreshit.text)])
            num = 0
            with open('export/' + title + '.csv', 'w') as csv_file:
                writer = csv.writer(csv_file, delimiter=',')
                writer.writerows(listception)

            file_path = os.path.join(directory, entry)
            text = open(file_path, mode='r').read()
            results = str(listception).strip('[]')
            results = results.replace("[", " ")
            results = results.replace("]", " ")
            results = results.replace("""\n""", " ")
            results_dir = "/Users/username/site/done"
            results_file = title + '.txt'
            file_path = os.path.join(results_dir, results_file)
            open(file_path, mode='w', encoding='UTF-8').write(results)
            continue


*        for subentry in os.scandir(entry.path):*            for file in
os.scandir(subentry.path):
                if ".html" in entry.name or
re.match(r'.*?(?:\.html?$|\.html?\?.*)', entry.name) is not None:
                    print(entry.name, entry.path)
                    my_data = (entry)
                    listofp = []
                    soup = bs.BeautifulSoup(open(my_data, "r").read())
                    for paragraph in soup.find_all('p'):
                        listofp.append(paragraph.string)
                        title = soup.title.string
                        leftitle = [title]
                        listception = [leftitle]
                    for moreshit in soup.find_all('h1', 'h2', 'h3', 'h4',
'h5'):
                        listception.append([str(moreshit.text)])
                    for paragraph in soup.find_all('p'):
                        listception.append([str(paragraph.text)])
                    for elements in soup.find_all('li', 'td', 'div',
'span'):
                        listception.append([str(elements.text)])
                    for evenmoreshit in soup.find_all('h6', 'a'):
                        listception.append([str(evenmoreshit.text)])
                    num = 0
                    with open('export/' + title + '.csv', 'w') as csv_file:
                        writer = csv.writer(csv_file, delimiter=',')
                        writer.writerows(listception)

                    file_path = os.path.join(directory, entry)
                    text = open(file_path, mode='r').read()
                    results = str(listception).strip('[]')
                    results = results.replace("[", " ")
                    results = results.replace("]", " ")
                    results = results.replace("""\n""", " ")
                    results_dir = "/Users/username/site/done"
                    results_file = title + '.txt'
                    file_path = os.path.join(results_dir, results_file)
                    open(file_path, mode='w',
encoding='UTF-8').write(results)
                    continue


Would love any help whatsoever or any suggestions of any kind. Thank you
very much!