producer-raw-recipies.py 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. from time import sleep
  2. import requests
  3. from bs4 import BeautifulSoup
  4. from kafka import KafkaProducer
  5. def publish_message(producer_instance, topic_name, key, value):
  6. try:
  7. key_bytes = bytes(key, encoding='utf-8')
  8. value_bytes = bytes(value, encoding='utf-8')
  9. producer_instance.send(topic_name, key=key_bytes, value=value_bytes)
  10. producer_instance.flush()
  11. print('Message published successfully.')
  12. except Exception as ex:
  13. print('Exception in publishing message')
  14. print(str(ex))
  15. def connect_kafka_producer():
  16. _producer = None
  17. try:
  18. _producer = KafkaProducer(bootstrap_servers=['localhost:9092'], api_version=(0, 10))
  19. except Exception as ex:
  20. print('Exception while connecting Kafka')
  21. print(str(ex))
  22. finally:
  23. return _producer
  24. def fetch_raw(recipe_url):
  25. html = None
  26. print('Processing..{}'.format(recipe_url))
  27. try:
  28. r = requests.get(recipe_url, headers=headers)
  29. if r.status_code == 200:
  30. html = r.text
  31. except Exception as ex:
  32. print('Exception while accessing raw html')
  33. print(str(ex))
  34. finally:
  35. return html.strip()
  36. def get_recipes():
  37. recipies = []
  38. salad_url = 'https://www.allrecipes.com/recipes/96/salad/'
  39. url = 'https://www.allrecipes.com/recipes/96/salad/'
  40. print('Accessing list')
  41. try:
  42. r = requests.get(url, headers=headers)
  43. if r.status_code == 200:
  44. html = r.text
  45. soup = BeautifulSoup(html, 'lxml')
  46. links = soup.select('.fixed-recipe-card__h3 a')
  47. idx = 0
  48. for link in links:
  49. sleep(2)
  50. recipe = fetch_raw(link['href'])
  51. recipies.append(recipe)
  52. idx += 1
  53. except Exception as ex:
  54. print('Exception in get_recipes')
  55. print(str(ex))
  56. finally:
  57. return recipies
  58. if __name__ == '__main__':
  59. headers = {
  60. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
  61. 'Pragma': 'no-cache'
  62. }
  63. all_recipes = get_recipes()
  64. if len(all_recipes) > 0:
  65. kafka_producer = connect_kafka_producer()
  66. for recipe in all_recipes:
  67. publish_message(kafka_producer, 'raw_recipes', 'raw', recipe.strip())
  68. if kafka_producer is not None:
  69. kafka_producer.close()