-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added Python scripts for writing to file and web scraping
- Loading branch information
Showing
7 changed files
with
256 additions
and
184 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
## Python 3 constructs on Windows 11 | ||
* The version of Python used is 3.12.5 | ||
* Installed in `C:\Python312` | ||
* Add the modules in `C:\Python312\Scripts` using `pip3 install <package name>` | ||
|
||
### Web Scraping | ||
* `writing.py` - Write to file | ||
* `pysave.py` - Extract contents from a webpage and write to file / parse text | ||
* `WebScrapeChatGPT.py` - Webscrape and parse content using `requests` and `BeautifulSoup4` packages | ||
* `html.zip` - Sample html.txt file scraped from a Web URL and zipped |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# pip install requests beautifulsoup4 | ||
|
||
""" | ||
# C:\Python312\Scripts>pip3 install beautifulsoup4 | ||
Defaulting to user installation because normal site-packages is not writeable | ||
Collecting beautifulsoup4 | ||
Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB) | ||
Collecting soupsieve>1.2 (from beautifulsoup4) | ||
Downloading soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB) | ||
Downloading beautifulsoup4-4.12.3-py3-none-any.whl (147 kB) | ||
Downloading soupsieve-2.6-py3-none-any.whl (36 kB) | ||
Installing collected packages: soupsieve, beautifulsoup4 | ||
Successfully installed beautifulsoup4-4.12.3 soupsieve-2.6 | ||
Use the following Python code: | ||
python | ||
""" | ||
|
||
import requests
from bs4 import BeautifulSoup

# URL of the web page you want to extract
url = 'https://google.com'  # Replace with the URL of your choice

# Fetch the content from the URL. The timeout stops the script from hanging
# indefinitely on an unresponsive server, and connection-level failures
# (DNS, refused connection, timeout) end the script with a readable message
# instead of a traceback.
try:
    response = requests.get(url, timeout=10)
except requests.RequestException as exc:
    raise SystemExit(f'Failed to retrieve the web page. Error: {exc}')

# Check if the request was successful
if response.status_code == 200:
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract and display the title of the web page.
    # soup.title is None when the document has no <title> tag, so guard the
    # attribute access rather than crash with AttributeError.
    title = soup.title.string if soup.title is not None else None
    print('Title of the page:', title)

    # Example: Extract all paragraph text
    for para in soup.find_all('p'):
        print(para.get_text())
else:
    print('Failed to retrieve the web page. Status code:', response.status_code)
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# https://sebhastian.com/no-module-named-requests/ | ||
# in cmd prompt | ||
# cd D:\Python311\Scripts | ||
# pip3 install requests | ||
# | ||
# C:\Python312\Scripts>pip3 install requests | ||
''' | ||
Collecting requests | ||
Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB) | ||
Collecting charset-normalizer<4,>=2 (from requests) | ||
Downloading charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl.metadata (34 kB) | ||
Collecting idna<4,>=2.5 (from requests) | ||
Downloading idna-3.8-py3-none-any.whl.metadata (9.9 kB) | ||
Collecting urllib3<3,>=1.21.1 (from requests) | ||
Downloading urllib3-2.2.2-py3-none-any.whl.metadata (6.4 kB) | ||
Collecting certifi>=2017.4.17 (from requests) | ||
Downloading certifi-2024.8.30-py3-none-any.whl.metadata (2.2 kB) | ||
Downloading requests-2.32.3-py3-none-any.whl (64 kB) | ||
Downloading certifi-2024.8.30-py3-none-any.whl (167 kB) | ||
Downloading charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl (100 kB) | ||
Downloading idna-3.8-py3-none-any.whl (66 kB) | ||
Downloading urllib3-2.2.2-py3-none-any.whl (121 kB) | ||
Installing collected packages: urllib3, idna, charset-normalizer, certifi, requests | ||
Successfully installed certifi-2024.8.30 charset-normalizer-3.3.2 idna-3.8 requests-2.32.3 urllib3-2.2.2 | ||
''' | ||
|
||
|
||
import requests

# Opening tag of the paragraph that holds the '<day> <month> <a href="...">'
# fragment this script extracts.
MARKER = '<p class="has-text-align-center">'


def _parse_entry(html):
    """Return ``(day, month, url)`` from the first MARKER paragraph in *html*.

    The text following the marker is expected to look like
    ``<day> <month> ...<a href="<url>/view...">``: the first two
    space-separated tokens are the day and month, and the first
    double-quoted string after them is the link, which is cut at ``/view``.

    Returns ``None`` when the marker or any of the expected pieces is
    missing, so callers can report the problem instead of hitting an
    IndexError.
    """
    parts = html.split(MARKER, 2)
    if len(parts) < 2:
        return None  # marker not present on this page
    fields = parts[1].split(' ', 2)
    if len(fields) < 3:
        return None  # not enough tokens for day, month and link
    day, month, remainder = fields
    quoted = remainder.split('"', 2)
    if len(quoted) < 2:
        return None  # no quoted link after the date
    return day, month, quoted[1].split('/view', 1)[0]


# A timeout keeps the script from hanging on an unresponsive server.
r = requests.get("https://example.com/", timeout=10)
print(r.status_code)

# To save the raw page to disk instead of parsing it in memory:
# t=open('html.zip', 'wb')
# t=open('html.txt', 'wb')
# t.write(r.content)
# t.close()

# To parse a previously saved copy:
# from pathlib import Path
# contents = Path("html.txt").read_text()

# Parse the decoded text rather than repr(r.content): the repr of a bytes
# object adds a b'...' wrapper and backslash escapes that are not part of
# the page.
contents = r.text
entry = _parse_entry(contents)
if entry is None:
    print('Could not find the expected', MARKER, 'entry in the page.')
else:
    day, month, url = entry
    print(day, ' ', month, ' ', url)
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import math

# Create (or truncate) page.txt and write the first line.
# The with-block closes the file even if a write raises.
with open('page.txt', 'w') as t:
    t.write('Hello. Okay.')

# Re-open the same file in append mode to add the Pi lines after the
# existing content.
pi = math.pi
with open('page.txt', 'a') as t:
    t.write("\nValue of Pi is ")
    t.write(repr(math.pi))  # repr converts float to string
    t.write("\nValue of formatted Pi is: {}".format(pi))
    print("\nWriting to file now.")
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.