
Commit 1f85281

Added Python scripts for writing to file and web scraping

apmuthu committed Sep 6, 2024
1 parent f44e7f0 commit 1f85281
Showing 7 changed files with 256 additions and 184 deletions.
10 changes: 10 additions & 0 deletions Python/README.md
@@ -0,0 +1,10 @@
## Python 3 constructs on Windows 11
* The version of Python used is 3.12.5
* Installed in `C:\Python312`
* Add the modules in `C:\Python312\Scripts` using `pip3 install <package name>`

### Web Scraping
* `writing.py` - Write to file
* `pysave.py` - Extract contents from a webpage and write to file / parse text
* `WebScrapeChatGPT.py` - Webscrape and parse content using `requests` and `BeautifulSoup4` packages
* `html.zip` - Sample html.txt file scraped from a Web URL and zipped
44 changes: 44 additions & 0 deletions Python/WebScrapeChatGPT.py
@@ -0,0 +1,44 @@
# pip install requests beautifulsoup4

"""
# C:\Python312\Scripts>pip3 install beautifulsoup4
Defaulting to user installation because normal site-packages is not writeable
Collecting beautifulsoup4
Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
Downloading soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
Downloading soupsieve-2.6-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.12.3 soupsieve-2.6
"""

import requests
from bs4 import BeautifulSoup

# URL of the web page you want to extract
url = 'https://google.com' # Replace with the URL of your choice

# Fetch the content from the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract and display the title of the web page
    title = soup.title.string
    print('Title of the page:', title)

    # Example: Extract all paragraph text
    paragraphs = soup.find_all('p')
    for para in paragraphs:
        print(para.get_text())
else:
    print('Failed to retrieve the web page. Status code:', response.status_code)
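The fetch-and-parse flow above can be exercised without a live network request by feeding BeautifulSoup an in-memory page; a minimal sketch using the same `html.parser` backend (the sample HTML and the link-extraction step are illustrative, not part of the original script):

```python
from bs4 import BeautifulSoup

# A small in-memory page stands in for response.text from requests.get()
html = """
<html><head><title>Sample Page</title></head>
<body>
  <p>First paragraph.</p>
  <p>Second paragraph.</p>
  <a href="https://example.com/one">One</a>
</body></html>
"""

soup = BeautifulSoup(html, 'html.parser')

# Same calls as the script above, on known input
print('Title of the page:', soup.title.string)
for para in soup.find_all('p'):
    print(para.get_text())

# Extracting hyperlinks works the same way
links = [a['href'] for a in soup.find_all('a', href=True)]
print(links)
```

Testing against a fixed string like this makes the parsing logic verifiable before pointing the script at a real URL.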

Binary file added Python/html.zip
57 changes: 57 additions & 0 deletions Python/pysave.py
@@ -0,0 +1,57 @@
# https://sebhastian.com/no-module-named-requests/
# in cmd prompt
# cd D:\Python311\Scripts
# pip3 install requests
#
# C:\Python312\Scripts>pip3 install requests
'''
Collecting requests
Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests)
Downloading charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl.metadata (34 kB)
Collecting idna<4,>=2.5 (from requests)
Downloading idna-3.8-py3-none-any.whl.metadata (9.9 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
Downloading urllib3-2.2.2-py3-none-any.whl.metadata (6.4 kB)
Collecting certifi>=2017.4.17 (from requests)
Downloading certifi-2024.8.30-py3-none-any.whl.metadata (2.2 kB)
Downloading requests-2.32.3-py3-none-any.whl (64 kB)
Downloading certifi-2024.8.30-py3-none-any.whl (167 kB)
Downloading charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl (100 kB)
Downloading idna-3.8-py3-none-any.whl (66 kB)
Downloading urllib3-2.2.2-py3-none-any.whl (121 kB)
Installing collected packages: urllib3, idna, charset-normalizer, certifi, requests
Successfully installed certifi-2024.8.30 charset-normalizer-3.3.2 idna-3.8 requests-2.32.3 urllib3-2.2.2
'''


import requests

r = requests.get("https://example.com/")
print(r.status_code)
# t=open('html.zip', 'wb')

# t=open('html.txt', 'wb')
# t.write(r.content)
# t.close()

# from pathlib import Path
# contents = Path("html.txt").read_text()

contents = repr(r.content)

# The marker paragraph precedes the date; parts[1] starts with "<day> <month> ..."
parts = contents.split('<p class="has-text-align-center">', 2)
datestr = parts[1].split(' ', 2)
day = datestr[0]
month = datestr[1]

# The download link follows in the remainder; drop the trailing /view segment
linkstr = datestr[2].split('"', 2)
link = linkstr[1].split('/view', 1)
url = link[0]
print(day, ' ', month, ' ', url)
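The raw string splitting above raises an `IndexError` if the marker paragraph is absent from the page. A sketch of the same date-and-link extraction done through BeautifulSoup instead (the sample HTML fragment is an illustrative assumption shaped like the page `pysave.py` targets):

```python
from bs4 import BeautifulSoup

# Illustrative fragment shaped like the page pysave.py parses
html = '''
<p class="has-text-align-center">06 September
  <a href="https://example.com/file/view">download</a></p>
'''

soup = BeautifulSoup(html, 'html.parser')
para = soup.find('p', class_='has-text-align-center')
if para is None:
    raise ValueError('expected marker paragraph not found')

# First two words of the paragraph text are day and month
day, month = para.get_text().split()[:2]

# Take the href and drop the trailing /view segment, as above
link = para.find('a', href=True)
url = link['href'].split('/view', 1)[0]
print(day, month, url)  # 06 September https://example.com/file
```

The explicit `None` check replaces the silent indexing, so a changed page layout fails with a clear message instead of a traceback deep in the splits.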








15 changes: 15 additions & 0 deletions Python/writing.py
@@ -0,0 +1,15 @@
import math
t=open('page.txt', 'w')
t.write('Hello. Okay.')
t.close()

t=open('page.txt', 'a')
t.write("\nValue of Pi is ")
t.write(repr(math.pi)) # repr converts float to string
pi=math.pi
t.write("\nValue of formatted Pi is: {}".format(pi))
print("\nWriting to file now.")
t.close()
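The open/write/close sequence in `writing.py` can also be written with `with` blocks, which close the file even if a write raises; a minimal sketch (the filename `page_demo.txt` is illustrative):

```python
import math

# 'with' closes the file automatically, replacing the explicit t.close() calls
with open('page_demo.txt', 'w') as t:
    t.write('Hello. Okay.')

# Reopen in append mode, as writing.py does
with open('page_demo.txt', 'a') as t:
    t.write('\nValue of Pi is ')
    t.write(repr(math.pi))  # repr converts float to string
    t.write('\nValue of formatted Pi is: {:.5f}'.format(math.pi))

# Read the file back to confirm the writes landed
with open('page_demo.txt') as t:
    print(t.read())
```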



2 changes: 2 additions & 0 deletions README.md
@@ -71,6 +71,8 @@ pubip=`wget -qO- http://www.apmuthu.com/ip.php`
* `TandC.php` - Place defined constants in a PHP String
* `mask_download_url.php` - Mask the real URL of a downloadable file in php

## [Python scripts](Python)

## PDF Scripts
* `images_to_pdf.php` - Join images in a folder into a single PDF file using FPDF library
* `pdfoverlay` - Generate an overlaid PDF populating a template PDF file using data from a database
