
Comments (4)

VinciGit00 commented on July 1, 2024

Can you provide the whole output and input?


HacanAle commented on July 1, 2024
```python
import argparse
import json
import os
import random
import re
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI
from scrapegraphai.graphs import SmartScraperGraph

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

def get_graph_config():
    return {
        "llm": {
            "api_key": openai_api_key,
            "model": "gpt-4-turbo",
        },
        "embeddings": {
            "model": "ollama/nomic-embed-text",
            "base_url": "http://localhost:11434",  
        },
    }

def scrape_url(url):
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://www.google.com/',
        'Connection': 'keep-alive',
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text(separator=' ', strip=True)
    except requests.RequestException as e:
        return str(e)

def is_subpage_or_subdomain(source_url, url):
    source_parsed = urlparse(source_url)
    url_parsed = urlparse(url)
    
    # str.strip('www.') removes any of the characters 'w' and '.', not the
    # prefix; removeprefix() drops only a leading "www.".
    source_netloc = source_parsed.netloc.removeprefix('www.')
    url_netloc = url_parsed.netloc.removeprefix('www.')
    
    return (url_parsed.path.startswith(source_parsed.path) or url_netloc.endswith(source_netloc)) and (url_netloc != '')

def extract_emails_from_text(text):
    email_pattern = r'\b(?:contact|info|sales|queries?|query)@[\w.-]+\b'
    emails = re.findall(email_pattern, text)
    return emails

client = OpenAI(api_key=openai_api_key)

def generate_email_text(scraped_data):
    response = client.chat.completions.create(
        model="gpt-4o",
        max_tokens=1500,
        messages=[
            {"role": "system", "content": "role"},
            {"role": "user", "content": role"}
        ]
    )
    
    email_text = response.choices[0].message.content

    print("Email Text:", email_text)
    return email_text

def send_email(subject, body, to_email):
    sender_email = "[email protected]"
    receiver_email = to_email
    password = "password"

    message = MIMEMultipart()
    message["From"] = sender_email
    message["To"] = receiver_email
    message["Subject"] = subject

    message.attach(MIMEText(body, "plain"))

    with smtplib.SMTP('smtp.gmail.com', 587) as server:
        server.starttls()
        server.login(sender_email, password)
        server.sendmail(sender_email, receiver_email, message.as_string())

def process_url(source_url):
    graph_config = get_graph_config()

    smart_scraper_graph = SmartScraperGraph(
        prompt="prompt1.",
        source=source_url,
        config=graph_config
    )
    
    result = smart_scraper_graph.run()
    
    print(result, "\n")
    url_pattern = r'https://[^\s\'"\\]+'
    urls = re.findall(url_pattern, str(result))
    print(urls, "\n")

    email_pattern = r'\b[\w.-]+@[\w.-]+\.[\w.-]+\b'
    emails_list_1 = re.findall(email_pattern, str(result))

    scraped_data = {}
    if urls:
        for url in urls:
            if is_subpage_or_subdomain(source_url, url):
                print(f"Scraping URL: {url}")
                smart_scraper = SmartScraperGraph(
                    prompt="prompt2",
                    source=url,
                    config=graph_config
                )
                
                text_content = smart_scraper.run()
                emails_from_text = extract_emails_from_text(text_content)
                scraped_data[url] = {
                    'text_content': text_content,
                    'emails': emails_from_text
                }
            else:
                print(f"Skipping URL: {url} (not a subpage or subdomain of {source_url})")

    email = generate_email_text(scraped_data)

    subject_pattern = r'Subject:\s*(.*)'
    match = re.search(subject_pattern, email)
    if match:
        subject = match.group(1).strip()
        email_body = email[match.end():].strip()
    else:
        subject = None
        email_body = email.strip()

    send_email(subject, email_body, "[email protected]")

    name_pattern = r'www\.([a-zA-Z0-9-]+)\.[a-zA-Z]{2,3}(\.[a-zA-Z]{2})?'
    match = re.search(name_pattern, source_url)
    # Fall back to a default so file_name is always defined even when the
    # pattern does not match.
    filename = match.group(1) if match else "output"

    file_name = f"{filename}_{random.randint(1, 100)}.txt"
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(email, f, indent=4, ensure_ascii=False)

    print(f"Output saved to {file_name}")

def main(input_file):
    with open(input_file, 'r') as file:
        urls = file.readlines()

    for url in urls:
        url = url.strip()
        if url:
            process_url(url)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Scrape websites.')
    parser.add_argument('--file', type=str, required=True, help='The text file containing URLs to scrape')

    args = parser.parse_args()
    main(args.file)
```

Error

```text
Traceback (most recent call last):
  File "/home/user/scraper/ai.py", line 186, in <module>
    main(args.file)
  File "/home/user/scraper/ai.py", line 178, in main
    process_url(url)
  File "/home/user/scraper/ai.py", line 128, in process_url
    smart_scraper = SmartScraperGraph(
                    ^^^^^^^^^^^^^^^^^^
  File "/home/user/miniconda3/envs/scrapeai/lib/python3.11/site-packages/scrapegraphai/graphs/smart_scraper_graph.py", line 52, in __init__
    super().__init__(prompt, config, source, schema)
  File "/home/user/miniconda3/envs/scrapeai/lib/python3.11/site-packages/scrapegraphai/graphs/abstract_graph.py", line 73, in __init__
    self.embedder_model = self._create_default_embedder(llm_config=config["llm"]) if "embeddings" not in config else self._create_embedder(
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/miniconda3/envs/scrapeai/lib/python3.11/site-packages/scrapegraphai/graphs/abstract_graph.py", line 371, in _create_embedder
    raise ValueError("Model provided by the configuration not supported")
ValueError: Model provided by the configuration not supported
```
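The conditional quoted in the traceback shows how the embedder is selected: `abstract_graph.py` only calls `_create_embedder()` when an `"embeddings"` key is present in the config, and otherwise builds a default embedder from the `"llm"` entry. A minimal sketch of a config that sidesteps the rejected `ollama/nomic-embed-text` embedder by omitting that key entirely (assuming the default-embedder fallback works for an OpenAI `llm` entry):

```python
import os

from dotenv import load_dotenv

load_dotenv()

def get_graph_config():
    # Sketch only: with no "embeddings" key, abstract_graph.py falls back to
    # _create_default_embedder(llm_config=config["llm"]) per the traceback,
    # rather than rejecting "ollama/nomic-embed-text" in _create_embedder().
    return {
        "llm": {
            "api_key": os.getenv("OPENAI_API_KEY"),
            "model": "gpt-4-turbo",
        },
    }
```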


HacanAle commented on July 1, 2024

Here is another example:

```python
import argparse
import os

from dotenv import load_dotenv
from openai import OpenAI
from scrapegraphai.graphs import SmartScraperGraph

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")



def get_graph_config():
    return {
        "llm": {
            "api_key": openai_api_key,
            "model": "gpt-3.5-turbo",
        },
        "embeddings": {
            "model": "ollama/nomic-embed-text",
            "base_url": "http://localhost:11434",  
        },
    }
client = OpenAI(api_key=openai_api_key)
def rewrite_faqs(scraped_data):
    response = client.chat.completions.create(
        model="gpt-4o",
        max_tokens=1500,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that rewrites FAQs in a clear and concise manner."},
            {"role": "user", "content": f"Please rewrite the following FAQs:\n{scraped_data}"}
        ]
    )
    
    faqs = response.choices[0].message.content

    print("Faq Text:", faqs)
    return faqs


def scrape_and_save(urls):
    graph_config = get_graph_config()
    
    results = []
    for url in urls:
        try:
            smart_scraper_graph = SmartScraperGraph(
                prompt="Extract all FAQs from the provided URL.",
                source=url,
                config=graph_config
            )
            result = smart_scraper_graph.run()
            rewritten_faqs = rewrite_faqs(result)
            results.append(f"URL: {url}\nRewritten FAQs: {rewritten_faqs}\n")
        except Exception as e:
            results.append(f"URL: {url}\nError: {str(e)}\n")

    with open("faq_result.txt", "w") as file:
        file.writelines(results)

def main(input_file):
    with open(input_file, "r") as file:
        urls = [line.strip() for line in file.readlines()]
    scrape_and_save(urls)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Scrape multiple websites from a text file for useful links, sublinks, and email addresses, and save the results.')
    parser.add_argument('--file', type=str, required=True, help='The text file containing URLs to scrape')

    args = parser.parse_args()
    main(args.file)
```

### OUTPUT

```text
URL: https://www.hekis.com/support/faq/faq-gq
Rewritten FAQs: ### FAQs

  1. What are the rules/regulations for importing a vehicle?
    • Please consult with your country for export rules and pre-export inspections. You can find references to these authorities in our FAQ section “Basic Export Rules/Regulations per Country.” It's important to understand the requirements and responsibilities of importing a vehicle in your country before proceeding.
  2. Can you ship auto parts inside the vehicle I am buying?
```

PeriniM commented on July 1, 2024

Hey @HacanAle, I have tried the following with no errors; it seems like your issue is related to the embedder model:

```python
import os

from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph

# Assumes OPENAI_API_KEY is set in the environment / .env file.
load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")

def get_graph_config():
    return {
        "llm": {
            "api_key": openai_key,
            "model": "gpt-3.5-turbo",
        },
        "embeddings": {
            "model": "ollama/nomic-embed-text",
            "base_url": "http://localhost:11434",  
        },
        "verbose": True,
        "headless": False,
    }

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

config = get_graph_config()

smart_scraper_graph1 = SmartScraperGraph(
    prompt="List me all the projects with their description",
    # also accepts a string with the already downloaded HTML code
    source="https://perinim.github.io/projects/",
    config=config,
)

result = smart_scraper_graph1.run()

smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects",
    # also accepts a string with the already downloaded HTML code
    source="https://perinim.github.io/projects/",
    config=config,
)

result1 = smart_scraper_graph.run()
```

You can try installing the new version and see if you still get the problem. I have also tried it inside a for loop iterating over prompts and still got no errors.
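For reference, a minimal sketch of that loop-based check (the prompt strings and URL below are illustrative placeholders, not the original test set):

```python
# Iterate over several prompts against the same source, reusing the `config`
# from get_graph_config() above; each iteration builds a fresh graph.
prompts = [
    "List me all the projects with their description",
    "List me all the projects",
]

for prompt in prompts:
    graph = SmartScraperGraph(
        prompt=prompt,
        source="https://perinim.github.io/projects/",
        config=config,
    )
    print(graph.run())
```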

Hope it helps


