Comments (4)
can you provide the whole output and input?
from scrapegraph-ai.
`
# Load variables from a local .env file, then read the OpenAI API key.
load_dotenv()
# NOTE(review): this is None when OPENAI_API_KEY is unset — the OpenAI client
# below would then be constructed with api_key=None; confirm the .env file exists.
openai_api_key = os.getenv("OPENAI_API_KEY")
def get_graph_config():
    """Build the configuration mapping consumed by SmartScraperGraph.

    Returns:
        dict: an "llm" section pointing at OpenAI's gpt-4-turbo and an
        "embeddings" section pointing at a local Ollama embedder.
    """
    llm_section = {
        "api_key": openai_api_key,
        "model": "gpt-4-turbo",
    }
    embedder_section = {
        "model": "ollama/nomic-embed-text",
        "base_url": "http://localhost:11434",
    }
    return {"llm": llm_section, "embeddings": embedder_section}
def scrape_url(url):
    """Download *url* and return its visible text content.

    Sends browser-like headers so simple bot filters do not reject the
    request. On failure the exception message is returned as a string
    instead of raising (best-effort behaviour preserved from the original).

    Args:
        url: Absolute URL to fetch.

    Returns:
        str: the page text with tags stripped and whitespace collapsed,
        or the error message when the request fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://www.google.com/',
        'Connection': 'keep-alive',
    }
    try:
        # Fix: requests has NO default timeout; without one a single
        # unresponsive host hangs the whole crawl forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text(separator=' ', strip=True)
    except requests.RequestException as e:
        return str(e)
def is_subpage_or_subdomain(source_url, url):
    """Return True if *url* looks like a subpage or subdomain of *source_url*.

    Fixes over the original:
    * ``netloc.strip('www.')`` stripped any run of the characters ``w``/``.``
      from BOTH ends (e.g. ``"www.wwf.org"`` became ``"f.org"``);
      ``str.removeprefix`` removes only a literal leading ``"www."``.
    * the subdomain suffix test is now anchored at a dot, so
      ``"evil-example.com"`` no longer counts as a subdomain of
      ``"example.com"``.

    NOTE(review): the path-prefix test is kept ORed in, as in the original,
    even though it ignores the host entirely — confirm that is intended.

    Args:
        source_url: The site being crawled.
        url: A candidate link found on the site.

    Returns:
        bool: True when *url* has a non-empty host that equals or is a
        subdomain of the source host, or when its path extends the source
        path.
    """
    source_parsed = urlparse(source_url)
    url_parsed = urlparse(url)
    source_netloc = source_parsed.netloc.removeprefix('www.')
    url_netloc = url_parsed.netloc.removeprefix('www.')
    if not url_netloc:
        return False
    same_or_sub = (url_netloc == source_netloc
                   or url_netloc.endswith('.' + source_netloc))
    return url_parsed.path.startswith(source_parsed.path) or same_or_sub
def extract_emails_from_text(text):
    """Return all role-account email addresses found in *text*.

    Only addresses whose local part is one of the common role names
    (contact, info, sales, query, queries) are matched — personal
    addresses are deliberately ignored.
    """
    role_email_re = re.compile(r'\b(?:contact|info|sales|queries?|query)@[\w.-]+\b')
    return role_email_re.findall(text)
client = OpenAI(api_key=openai_api_key)
def generate_email_text(scraped_data):
    """Ask the chat model to draft an outreach email from *scraped_data*.

    Fix: the original user turn was ``{"role": "user", "content": role"}`` —
    a syntax error referencing an undefined name — and *scraped_data* was
    never sent to the model. The user message now carries the scraped data.

    Args:
        scraped_data: Mapping of url -> {'text_content', 'emails'} built by
            process_url() (any printable object is accepted).

    Returns:
        str: the model's generated email text.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        max_tokens=1500,
        messages=[
            # NOTE(review): the system prompt was redacted to "role" in the
            # original paste — restore the real instruction text here.
            {"role": "system", "content": "role"},
            {"role": "user", "content": f"{scraped_data}"},
        ],
    )
    email_text = response.choices[0].message.content
    print("Email Text:", email_text)
    return email_text
def send_email(subject, body, to_email):
    """Send a plain-text email through Gmail's SMTP relay.

    Security fix: the SMTP password was hard-coded in source; it is now read
    from the EMAIL_PASSWORD environment variable (falling back to the old
    literal so behaviour is unchanged when the variable is unset).

    Args:
        subject: Subject line (process_url may pass None; the MIME header
            then renders as "None" — confirm that is acceptable).
        body: Plain-text message body.
        to_email: Recipient address.
    """
    sender_email = "[email protected]"
    receiver_email = to_email
    # NOTE(review): set EMAIL_PASSWORD (a Gmail app password) in the
    # environment; never commit credentials to source control.
    password = os.getenv("EMAIL_PASSWORD", "password")
    message = MIMEMultipart()
    message["From"] = sender_email
    message["To"] = receiver_email
    message["Subject"] = subject
    message.attach(MIMEText(body, "plain"))
    # starttls() upgrades the connection before credentials are transmitted.
    with smtplib.SMTP('smtp.gmail.com', 587) as server:
        server.starttls()
        server.login(sender_email, password)
        server.sendmail(sender_email, receiver_email, message.as_string())
def process_url(source_url):
    """Scrape *source_url*, crawl its internal links, draft and send an
    outreach email, and save the generated email to a local text file.

    Fixes over the original:
    * the output filename was derived from the loop variable ``url``, which
      is unbound (NameError) when no links are found and otherwise holds the
      *last* crawled link — it is now derived from ``source_url``;
    * the extracted site name (``filename``) was assigned but never used in
      the actual file name; it now is, with a fallback when no match.

    Args:
        source_url: The URL to scrape and crawl.
    """
    graph_config = get_graph_config()
    smart_scraper_graph = SmartScraperGraph(
        prompt="prompt1.",
        source=source_url,
        config=graph_config
    )
    result = smart_scraper_graph.run()
    print(result, "\n")

    # Collect candidate links and any emails already present in the result.
    url_pattern = r'https://[^\s\'"\\]+'
    urls = re.findall(url_pattern, str(result))
    print(urls, "\n")
    email_pattern = r'\b[\w.-]+@[\w.-]+\.[\w.-]+\b'
    # NOTE(review): emails_list_1 is never used afterwards in the visible
    # code — confirm whether it should feed into the generated email.
    emails_list_1 = re.findall(email_pattern, str(result))

    # Crawl each internal link and harvest its text and role emails.
    scraped_data = {}
    for url in urls:
        if is_subpage_or_subdomain(source_url, url):
            print(f"Scraping URL: {url}")
            smart_scraper = SmartScraperGraph(
                prompt="prompt2",
                source=url,
                config=graph_config
            )
            text_content = smart_scraper.run()
            emails_from_text = extract_emails_from_text(text_content)
            scraped_data[url] = {
                'text_content': text_content,
                'emails': emails_from_text
            }
        else:
            print(f"Skipping URL: {url} (not a subpage or subdomain of {source_url})")

    # Draft the email and split out a leading "Subject:" line if present.
    email = generate_email_text(scraped_data)
    subject_pattern = r'Subject:\s*(.*)'
    match = re.search(subject_pattern, email)
    if match:
        subject = match.group(1).strip()
        email_body = email[match.end():].strip()
    else:
        subject = None
        email_body = email.strip()
    send_email(subject, email_body, "[email protected]")

    # Derive the output filename from the source site's registered name.
    name_pattern = r'www\.([a-zA-Z0-9-]+)\.[a-zA-Z]{2,3}(\.[a-zA-Z]{2})?'
    match = re.search(name_pattern, source_url)
    site_name = match.group(1) if match else "output"
    file_name = f"{site_name}_{random.randint(1, 100)}.txt"
    with open(file_name, 'w', encoding='utf-8') as f:
        # The email is a plain string; json.dump writes it as a JSON string.
        json.dump(email, f, indent=4, ensure_ascii=False)
    print(f"Output saved to {file_name}")
def main(input_file):
    """Run process_url() for every non-blank line (URL) in *input_file*."""
    with open(input_file, 'r') as file:
        lines = file.readlines()
    for line in lines:
        candidate = line.strip()
        if candidate:
            process_url(candidate)
if __name__ == "__main__":
    # CLI entry point: --file names a newline-separated list of URLs.
    parser = argparse.ArgumentParser(description='Scrape websites .')
    parser.add_argument('--file', type=str, required=True, help='The text file containing URLs to scrape')
    args = parser.parse_args()
    main(args.file)
Error
Traceback (most recent call last):
File "/home/user/scraper/ai.py", line 186, in
main(args.file)
File "/home/user/scraper/ai.py", line 178, in main
process_url(url)
File "/home/user/scraper/ai.py", line 128, in process_url
smart_scraper = SmartScraperGraph(
^^^^^^^^^^^^^^^^^^
File "/home/user/miniconda3/envs/scrapeai/lib/python3.11/site-packages/scrapegraphai/graphs/smart_scraper_graph.py", line 52, in init
super().init(prompt, config, source, schema)
File "/home/user/miniconda3/envs/scrapeai/lib/python3.11/site-packages/scrapegraphai/graphs/abstract_graph.py", line 73, in init
self.embedder_model = self._create_default_embedder(llm_config=config["llm"] ) if "embeddings" not in config else self._create_embedder(
^^^^^^^^^^^^^^^^^^^^^^
File "/home/user/miniconda3/envs/scrapeai/lib/python3.11/site-packages/scrapegraphai/graphs/abstract_graph.py", line 371, in _create_embedder
raise ValueError("Model provided by the configuration not supported")
ValueError: Model provided by the configuration not supported
from scrapegraph-ai.
Here is another example
# Second example script: load the .env file and read the OpenAI API key.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
def get_graph_config():
    """Return the scraper configuration: OpenAI gpt-3.5-turbo LLM plus a
    local Ollama embedder."""
    config = {}
    config["llm"] = {
        "api_key": openai_api_key,
        "model": "gpt-3.5-turbo",
    }
    config["embeddings"] = {
        "model": "ollama/nomic-embed-text",
        "base_url": "http://localhost:11434",
    }
    return config
client = OpenAI(api_key=openai_api_key)
def rewrite_faqs(scraped_data):
    """Send the scraped FAQ text to the chat model and return the rewrite.

    Args:
        scraped_data: Raw FAQ content extracted by the scraper.

    Returns:
        str: the model's rewritten FAQ text.
    """
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant that rewrites FAQs in a clear and concise manner."},
        {"role": "user", "content": f"Please rewrite the following FAQs:\n{scraped_data}"},
    ]
    response = client.chat.completions.create(
        model="gpt-4o",
        max_tokens=1500,
        messages=chat_messages,
    )
    faqs = response.choices[0].message.content
    print("Faq Text:", faqs)
    return faqs
def scrape_and_save(urls):
    """Scrape FAQs from every URL, rewrite them, and save all results.

    Each URL gets one entry in faq_result.txt; a failure for one URL is
    recorded as an error entry rather than aborting the whole run.
    """
    graph_config = get_graph_config()
    results = []
    for url in urls:
        try:
            scraper = SmartScraperGraph(
                prompt="Extract all FAQs from the provided URL.",
                source=url,
                config=graph_config,
            )
            scraped = scraper.run()
            rewritten = rewrite_faqs(scraped)
            entry = f"URL: {url}\nRewritten FAQs: {rewritten}\n"
        except Exception as e:
            # Best-effort: log the failure into the results file and move on.
            entry = f"URL: {url}\nError: {str(e)}\n"
        results.append(entry)
    with open("faq_result.txt", "w") as file:
        file.writelines(results)
def main(input_file):
    """Read URLs (one per line) from *input_file* and scrape them all."""
    with open(input_file, "r") as file:
        urls = [line.strip() for line in file]
    scrape_and_save(urls)
if __name__ == "__main__":
    # CLI entry point for the FAQ-rewriting example.
    parser = argparse.ArgumentParser(description='Scrape multiple websites from a text file for useful links, sublinks, and email addresses, and save the results.')
    parser.add_argument('--file', type=str, required=True, help='The text file containing URLs to scrape')
    args = parser.parse_args()
    main(args.file)
`
### OUTPUT
URL: https://www.hekis.com/support/faq/faq-gq
Rewritten FAQs: ### FAQs
- What are the rules/regulations for importing a vehicle?
- Please consult with your country for export rules and pre-export inspections. You can find references to these authorities in our FAQ section "Basic Export Rules/Regulations per Country." It's important to understand the requirements and responsibilities of importing a vehicle in your country before proceeding.
- Can you ship auto parts inside the vehicle I am buying?
- No, we cannot place anything inside the vehicle unless it is being shipped by container.
URL: https://www.hekis.com/support/faq/faq-vi
Error: Model provided by the configuration not supported
URL: https://www.hekis.com/support/faq/faq-bp
Error: Model provided by the configuration not supported
URL: https://www.hekis.com/support/faq/faq-bs
Error: Model provided by the configuration not supported
URL: https://www.hekis.com/support/faq/faq-doc
Error: Model provided by the configuration not supported
URL: https://www.hekis.com/support/faq/faq-rc
Error: Model provided by the configuration not supported
URL: https://www.hekis.com/support/faq/faq-cr
Error: Model provided by the configuration not supported
URL: https://www.hekis.com/support/faq/faq-terms
Error: Model provided by the configuration not supported
- No, we cannot place anything inside the vehicle unless it is being shipped by container.
from scrapegraph-ai.
Hey @HacanAle I have tried the following with no errors, it seems like your issue is related to the embedder model:
def get_graph_config():
    """Return the maintainer's working configuration.

    Same LLM/embedder layout as the question's config, plus "verbose" and
    "headless" flags for the browser fetcher.
    """
    llm_section = {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
    }
    embedder_section = {
        "model": "ollama/nomic-embed-text",
        "base_url": "http://localhost:11434",
    }
    return {
        "llm": llm_section,
        "embeddings": embedder_section,
        "verbose": True,
        "headless": False,
    }
# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************
# Build one config and reuse it for both graph runs.
config = get_graph_config()
# First run: extract projects together with their descriptions.
smart_scraper_graph1 = SmartScraperGraph(
prompt="List me all the projects with their description",
# also accepts a string with the already downloaded HTML code
source="https://perinim.github.io/projects/",
config=config,
)
result = smart_scraper_graph1.run()
# Second run with the SAME config object: project names only. This is the
# maintainer's demonstration that back-to-back runs do not raise.
smart_scraper_graph = SmartScraperGraph(
prompt="List me all the projects",
# also accepts a string with the already downloaded HTML code
source="https://perinim.github.io/projects/",
config=config,
)
result1 = smart_scraper_graph.run()
You can try installing the new version and see if you still got the problem. I have also tried it inside a for loop iterating over prompts and still got no errors.
Hope it helps
from scrapegraph-ai.
Related Issues (20)
- 'SmartScraperGraph' object has no attribute 'model_token' HOT 2
- Adding message parameter support for OpenAI models HOT 5
- Do we have a output parser to get a certain format output HOT 1
- No HTML body content found when trying FetchNode HOT 3
- Could it support other chinese ollama embedding models? HOT 7
- Switch between search engines
- How could I remove part of page content before sending to LLM? HOT 7
- BedRock Malformed input request: #/texts/0: expected maxLength: 2048, actual: 19882, please reformat your input and try agai HOT 5
- Not able to run Anthropic Claude models. HOT 4
- split unit testing from src
- Groq example results in context_length_exceeded error HOT 1
- 'Batchsize' config attribute does not change
- Result json truncated attachments for long attachment list
- ImportError: cannot import name 'Required' from 'typing' (/usr/local/lib/python3.11/typing.py) HOT 1
- Any way to pass html as it is to llm rather then cleaning/ remove html tags HOT 1
- Default Prompt template customization HOT 2
- 'SmartScraperGraph' object has no attribute 'model_token' HOT 1
- Add Vertex AI Integration HOT 1
- SearchGraph error while following the example
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from scrapegraph-ai.