"""Crawl property pages for entries in the ``properties`` table.

The script connects to the local ``realty`` MariaDB database, downloads a
list of SOCKS4 proxies from proxyscrape, selects every property id with
``sales_history=1`` in random order and, for each one whose
``last_crawled`` column is still NULL, requests a page through a randomly
chosen proxy and prints the response body.
"""

import random
import sys
from datetime import datetime

import htmlmin
import mariadb
import requests
import urllib3
from bs4 import BeautifulSoup

# The page request below is made with verify=False; silence the warning
# urllib3 would otherwise emit for every such request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Connect to MariaDB Platform
# NOTE(review): credentials are hard-coded here; consider reading them from
# the environment or a config file.
try:
    conn = mariadb.connect(
        user="root",
        password="sailnow",
        host="localhost",
        port=3306,
        database="realty",
    )
except mariadb.Error as e:
    print(f"Error connecting to MariaDB Platform: {e}")
    sys.exit(1)

# Get Pointer
db = conn.cursor(buffered=True)

headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}

# https://proxyscrape.com/free-proxy-list
proxyrequest = requests.get(url='https://api.proxyscrape.com/?request=getproxies&proxytype=socks4&timeout=1000&country=CA')
proxyrequest_format = proxyrequest.text.strip().replace("\r", "")
# Drop blank entries: an empty response would otherwise yield [""] and every
# request would be sent through the meaningless proxy URL "socks4://".
list_proxies = [p.strip() for p in proxyrequest_format.split("\n") if p.strip()]
if not list_proxies:
    print("No proxies available from proxyscrape")
    sys.exit(1)

db.execute("SELECT property_id FROM properties WHERE sales_history=1 ORDER BY RAND()")
rows = db.fetchall()

for row in rows:
    property_id = row[0]
    url = 'https://www.viewpoint.ca/property/' + str(property_id) + '/1'
    print(f"ID: {property_id}, URL: {url}")
    # NOTE(review): this assignment discards the per-property URL built above,
    # so every request goes to the same site. It looks like a debugging
    # override -- confirm and remove before crawling for real.
    url = 'https://www.thewebpeople.ca'

    # Parameters must be passed as a sequence, hence the one-element tuple.
    db.execute("SELECT last_crawled FROM properties WHERE property_id=%s", (property_id,))
    current_status = db.fetchone()

    # fetchone() returns a row tuple such as (None,), which is truthy even
    # when last_crawled is NULL, so the column value itself must be inspected.
    if current_status is not None and current_status[0] is not None:
        # not null
        print('Already downloaded')
        continue

    # is null
    proxy = random.choice(list_proxies)
    proxies = {
        "http": "socks4://" + proxy,
        "https": "socks4://" + proxy,
    }
    try:
        # NOTE(review): verify=False disables TLS certificate validation.
        r = requests.get(url=url, proxies=proxies, verify=False, headers=headers, timeout=5)
    except requests.RequestException as e:
        # Free proxies fail often; report the failure and move on to the
        # next property instead of aborting the whole run.
        print(f"Request for property {property_id} via proxy {proxy} failed: {e}")
        continue

    print('response received')
    print(r.text)
    #minified = htmlmin.minify(r.text.decode("utf-8"), remove_empty_space=True)
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    #Html_file= open("properties/"+property_id+"-"+now+".html","w")
    #Html_file.write(minified)
    #Html_file.close()