diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
new file mode 100644
index 000000000..195504389
--- /dev/null
+++ b/.github/workflows/CI.yml
@@ -0,0 +1,79 @@
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+# This workflow will build a Java project with Gradle and cache/restore any dependencies to improve the workflow execution time
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-java-with-gradle
+
+name: Java CI with Gradle
+
+on:
+  push:
+    branches: [ "master" ]
+    paths-ignore:
+      - 'src/python/**'
+      - '.github/workflows/lint.yml'
+  pull_request:
+    branches: [ "master" ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+#    strategy:
+#      matrix:
+#        os: [windows-latest,ubuntu-latest,macos-latest]
+    permissions:
+      contents: read
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up JDK 1.8
+      uses: actions/setup-java@v1
+      with:
+        java-version: 1.8
+    - name: Grant execute permission for gradlew
+      run: chmod +x gradlew
+
+    - name: Build with Gradle
+      run: ./gradlew build
+
+
+  docker:
+    needs: build
+    runs-on: ubuntu-latest
+
+    steps:
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v3
+        with:
+          context: .
+          push: true
+          tags: anoshrz/java_project:latest
+
+      - name: Log out from Docker Hub
+        run: docker logout
+    # NOTE: The Gradle Wrapper is the default and recommended way to run Gradle (https://docs.gradle.org/current/userguide/gradle_wrapper.html).
+    # If your project does not have the Gradle Wrapper configured, you can use the following configuration to run Gradle with a specified version.
+    #
+    # - name: Setup Gradle
+    #   uses: gradle/actions/setup-gradle@417ae3ccd767c252f5661f1ace9f835f9654f2b5 # v3.1.0
+    #   with:
+    #     gradle-version: '8.5'
+    #
+    # - name: Build with Gradle 8.5
+    #   run: gradle build
+
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
deleted file mode 100644
index a429a2a98..000000000
--- a/.github/workflows/ci.yml
+++ /dev/null
@@ -1,38 +0,0 @@
-# This workflow will build a Java project with Gradle
-# For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-gradle
-
-name: Java CI with Gradle
-
-on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
-
-jobs:
-  build-java:
-
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v2
-
-    - name: Set up JDK 1.8
-      uses: actions/setup-java@v1
-      with:
-        java-version: 1.8
-
-    - name: Grant execute permission for gradlew
-      run: chmod +x gradlew
-
-    - name: Build with Gradle
-      run: ./gradlew build
-
-    - name: Build and Push Docker Image
-      uses: mr-smithers-excellent/docker-build-push@v4
-      with:
-        image: nanajanashia/demo-app
-        registry: docker.io
-        username: ${{ secrets.DOCKER_USERNAME }}
-        password: ${{ secrets.DOCKER_PASSWORD }}
-
diff --git a/.github/workflows/greetings.yml b/.github/workflows/greetings.yml
new file mode 100644
index 000000000..3f892a703
--- /dev/null
+++ b/.github/workflows/greetings.yml
@@ -0,0 +1,13 @@
+name: Greetings
+
+on: [pull_request, issues]
+
+jobs:
+  greeting:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/first-interaction@v1
+      with:
+        repo-token: ${{ secrets.GITHUB_TOKEN }}
+        issue-message: 'Welcome, and thanks for opening your first issue!'
+        pr-message: 'Welcome, and thanks for opening your first pull request!'
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 000000000..048a66bb5
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,12 @@
+name: Lint and Format Code
+on:
+  push:
+    paths:
+      - 'src/python/**'
+      - '.github/workflows/lint.yml'
+jobs:
+  lint-and-format:
+    uses: anosh-ar/FAPS_Github_Action/.github/workflows/lint_reusable.yml@master
+    secrets:
+      SLACK_WEBHOOK_URL: ${{secrets.SLACK_WEBHOOK_URL}}
+
diff --git a/.github/workflows/lint_reusable.yml b/.github/workflows/lint_reusable.yml
new file mode 100644
index 000000000..47b4e1bb2
--- /dev/null
+++ b/.github/workflows/lint_reusable.yml
@@ -0,0 +1,41 @@
+name: Reusable Linting to Slack
+on:
+  workflow_call:
+    secrets:
+      SLACK_WEBHOOK_URL:
+        required: true
+
+jobs:
+  lint-and-format:
+    runs-on: arc-runner-set
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.8'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install black pylint
+      - name: Format code with Black
+        run: black .
+      - name: Run pylint
+        id: lint
+        continue-on-error: true
+        run: pylint ./**/**.py > pylint_report.txt
+
+      - name: Send code rating to Slack
+        run: |
+          RATING=$(grep -oP "Your code has been rated at \K[0-9\.]+/[0-9\.]+" pylint_report.txt)
+          curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"The pylint code rating for your latest push is: ${RATING}\"}" ${{ secrets.SLACK_WEBHOOK_URL }}
+
+      - name: Check for critical pylint errors
+        id: pylint_check
+        run: |
+          if grep -E "E[0-9]+" pylint_report.txt; then
+            echo "Critical pylint errors found in the code. Please check pylint_report.txt for details."
+            ERROR_MSG=$(grep -E "E[0-9]+" pylint_report.txt | tr '\n' ' ')  # keep the message on one line so the JSON payload stays valid
+            curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"Critical linting errors found in the code. Please check the logs for details:\n\`\`\`$ERROR_MSG\`\`\`\"}" ${{ secrets.SLACK_WEBHOOK_URL }}
+          fi
diff --git a/Dockerfile b/Dockerfile
index d2b1dc574..a382a2311 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@ FROM openjdk:8-jre-alpine
 
 EXPOSE 8080
 
-COPY ./build/libs/my-app-1.0-SNAPSHOT.jar /usr/app/
+#COPY ./build/libs/my-app-1.0-SNAPSHOT.jar /usr/app/
 
 WORKDIR /usr/app
 ENTRYPOINT ["java", "-jar", "my-app-1.0-SNAPSHOT.jar"]
diff --git a/src/python/preprocessing.py b/src/python/preprocessing.py
new file mode 100644
index 000000000..bd16bb21b
--- /dev/null
+++ b/src/python/preprocessing.py
@@ -0,0 +1,34 @@
+import pandas as pd
+
+
+def merge_data():
+
+    # Paths to the CSV files
+    csv_file_1 = "sources/qa.csv"  # first Q&A source
+    csv_file_2 = "sources/cncf_stackoverflow_qas.csv"  # second Q&A source (StackOverflow extract)
+
+    # Read the CSV files
+    df1 = pd.read_csv(csv_file_1)
+    df2 = pd.read_csv(csv_file_2)
+
+    # Select the columns of interest from the first file
+    df1_selected = df1[["Question", "Answer", "Project"]]
+
+    # Select and rename the columns of interest from the second file
+    df2_selected = df2[["question", "answer", "tag"]].rename(
+        columns={"question": "Question", "answer": "Answer", "tag": "Project"}
+    )
+
+    # Concatenate the selected and renamed columns
+    merged_df = pd.concat([df1_selected, df2_selected])
+
+    # Save the merged DataFrame to a new CSV file
+    merged_df.to_csv("merged_qas.csv", index=False)
+
+    print("Columns merged and saved successfully!")
+
+
+if __name__ == "__main__":
+    merge_data()
+    print(test_wrong_value)  # NOTE: undefined name; pylint reports this as an E-level (undefined-variable) error
diff --git a/src/python/stackoverflow_extractor.py b/src/python/stackoverflow_extractor.py
new file mode 100644
index 000000000..66f48543e
--- /dev/null
+++ b/src/python/stackoverflow_extractor.py
@@ -0,0 +1,314 @@
+import yaml
+import requests
+import pandas as pd
+from bs4 import BeautifulSoup
+import time
+import json
+import os
+import sys
+from datetime import datetime, timedelta
+
+API_KEY = ''  # Replace with your actual Stack Exchange API key
+REQUEST_DELAY = 0  # Number of seconds to wait between requests
+PROGRESS_FILE = 'sources/stackoverflow_progress.json'
+CSV_FILE = 'sources/cncf_stackoverflow_qas.csv'
+PROCESSED_IDS_FILE = 'sources/processed_question_ids.json'
+TAGS_FILE = 'sources/tags.json'
+TAGS_UPDATE_INTERVAL = 7  # Number of days between tag updates
+DAILY_REQUEST_LIMIT = 9000
+
+
+def fetch_with_backoff(api_url, params):
+    """Fetch data from the API, backing off and retrying when rate limited.
+
+    Args:
+        api_url (str): The API endpoint URL.
+        params (dict): Dictionary of query parameters for the API request.
+
+    Returns:
+        dict: The JSON response data from the API if successful.
+        None: If the API request fails.
+    """
+    while True:
+        # print(f"Fetching data with params: {params}")
+        response = requests.get(api_url, params=params)
+        if response.status_code == 200:
+            return response.json()
+        elif response.status_code == 429:
+            # Honour the Retry-After header (falling back to REQUEST_DELAY), then retry
+            retry_after = int(response.headers.get('retry-after', REQUEST_DELAY))
+            print(f"Rate limit exceeded. Waiting {retry_after} seconds before retrying...")
+            time.sleep(retry_after)
+        else:
+            print(f"Failed to fetch data: {response.status_code} - {response.text}")
+            sys.exit()
+    return None
+
+def qa_extractor(request_count, tag, start_page, page_size=100):
+    """Fetch questions from StackOverflow for a given tag.
+
+    Args:
+        request_count (int): Current count of API requests made.
+        tag (str): The tag to search for on StackOverflow.
+        start_page (int): The starting page number for the API request.
+        page_size (int, optional): Number of results per page. Defaults to 100.
+
+    Returns:
+        int: Updated request count after fetching questions.
+    """
+
+    api_url = "https://api.stackexchange.com/2.3/search/advanced"
+    questions = []
+
+    # Load processed question IDs
+    processed_question_ids = load_processed_question_ids()
+
+    while True:
+        if request_count >= DAILY_REQUEST_LIMIT:
+            break
+
+        params = {
+            'page': start_page,
+            'pagesize': page_size,
+            'order': 'desc',
+            'sort': 'activity',
+            'answers': 1,
+            'tagged': tag,
+            'site': 'stackoverflow',
+            'filter': 'withbody',  # Ensuring the 'body' field is included
+            'key': API_KEY
+        }
+
+        response_data = fetch_with_backoff(api_url, params)
+        request_count += 1
+        if not response_data or not response_data['items']:
+            save_progress(tag, "null")
+            break
+        QA_list = []
+        if response_data:
+            questions.extend(response_data['items'])
+
+            for question in response_data['items']:
+                question_id = question['question_id']
+                if question_id in processed_question_ids:
+                    continue
+                if question['answer_count'] > 0:
+                    question_text = remove_html_tags(question['body'])
+                    request_count += 1
+
+                    answers = fetch_answers(question_id)
+                    # formatted_answers = []
+
+                    for count, answer in enumerate(answers, start=1):
+                        if count > 3:
+                            break
+                        if answer['score'] < 0:
+                            continue
+                        answer_text = remove_html_tags(answer['body'])
+                        # formatted_answers.append(f"{count}. {answer_text}")
+
+                        QA_list.append({
+                            "question": question_text,
+                            # "answer": "\n".join(formatted_answers),
+                            "answer": answer_text,
+                            "tag": tag,
+                        })
+
+                    # Add question ID to the set of processed IDs
+                    processed_question_ids.add(question_id)
+
+            has_more = response_data.get('has_more', False)
+            if not has_more:
+                save_progress(tag, "finished")
+                break
+
+            print(f"Fetched {len(response_data['items'])} questions from page {start_page} for tag '{tag}'. Total so far: {len(questions)}")
+            save_to_csv(QA_list, CSV_FILE)
+            save_processed_question_ids(processed_question_ids)
+            start_page += 1
+            save_progress(tag, start_page)
+            time.sleep(REQUEST_DELAY)  # Add delay between requests to avoid rate limiting
+        else:
+            break
+        if request_count >= DAILY_REQUEST_LIMIT:
+            # print(f"Request count is: {request_count}")
+            break
+
+    print(f"Request count for question is: {request_count}")
+    return request_count
+
+def fetch_answers(question_id):
+    """Fetch answers for a specific question from StackOverflow.
+
+    Args:
+        question_id (int): The ID of the question to fetch answers for.
+
+    Returns:
+        list: List of answer items if successful, otherwise an empty list.
+    """
+    api_url = f"https://api.stackexchange.com/2.3/questions/{question_id}/answers"
+    params = {
+        'order': 'desc',
+        'sort': 'votes',
+        'site': 'stackoverflow',
+        'filter': 'withbody',  # Ensuring the 'body' field is included
+        'key': API_KEY
+    }
+
+    response_data = fetch_with_backoff(api_url, params)
+    return response_data['items'] if response_data else []
+
+def remove_html_tags(text):
+    """Remove HTML tags from a given text.
+
+    Args:
+        text (str): The HTML text to be processed.
+
+    Returns:
+        str: The text with HTML tags removed.
+    """
+    soup = BeautifulSoup(text, "html.parser")
+    return soup.get_text()
+
+def extract_all_projects(tags, request_count):
+    """Extract QA pairs for multiple tags.
+
+    Args:
+        tags (list): List of tags to process.
+        request_count (int): Initial count of API requests made.
+    """
+    progress = load_progress()
+    all_tags_done = True  # Flag to check if all tags are done
+    for tag in tags:
+        if progress.get(tag) == "null" or progress.get(tag) == "finished":
+            continue
+        else:
+            all_tags_done = False  # Found a tag that needs processing
+            start_page = progress.get(tag, 1)
+
+            request_count = qa_extractor(request_count, tag, start_page=start_page)
+            if request_count >= DAILY_REQUEST_LIMIT:
+                break
+    if all_tags_done:
+        print("All question-answer data has been fetched from StackOverflow.")
+
+def save_to_csv(data, filename):
+    """Save extracted data to a CSV file.
+
+    Args:
+        data (list): List of dictionaries containing QA data.
+        filename (str): The filename for the CSV file.
+    """
+    if os.path.exists(filename) and os.path.getsize(filename) > 0:
+        try:
+            df = pd.read_csv(filename)
+            df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)
+        except pd.errors.EmptyDataError:
+            df = pd.DataFrame(data)
+    else:
+        df = pd.DataFrame(data)
+    df.to_csv(filename, index=False)
+    # print(f"Data saved to {filename}")
+
+def load_progress():
+    """Load progress data from file.
+
+    Returns:
+        dict: Dictionary containing progress data.
+    """
+    try:
+        with open(PROGRESS_FILE, 'r') as f:
+            data = json.load(f)
+            return data
+    except FileNotFoundError:
+        print(f"File {PROGRESS_FILE} not found.")
+        return {}
+    except json.JSONDecodeError:
+        print(f"Error decoding JSON data in {PROGRESS_FILE}.")
+        return {}
+
+def save_progress(tag, page):
+    """Save progress data to file.
+
+    Args:
+        tag (str): The tag being processed.
+        page (str or int): The current page number or status.
+    """
+    progress = load_progress()
+    progress[tag] = page
+    with open(PROGRESS_FILE, 'w') as f:
+        json.dump(progress, f)
+
+def load_processed_question_ids():
+    """Load processed question IDs from file.
+
+    Returns:
+        set: Set of processed question IDs.
+    """
+    try:
+        if os.path.getsize(PROCESSED_IDS_FILE) == 0:
+            return set()
+        with open(PROCESSED_IDS_FILE, 'r') as f:
+            return set(json.load(f))
+    except FileNotFoundError:
+        return set()
+    except json.JSONDecodeError:
+        return set()
+
+def save_processed_question_ids(processed_ids):
+    """Save processed question IDs to file.
+
+    Args:
+        processed_ids (set): Set of processed question IDs.
+    """
+    with open(PROCESSED_IDS_FILE, 'w') as f:
+        json.dump(list(processed_ids), f)
+
+def load_tags():
+    """Load tags from the JSON file if it's not older than the update interval, otherwise from the YAML file.
+
+    Returns:
+        list: List of tags.
+    """
+    if os.path.exists(TAGS_FILE):
+        with open(TAGS_FILE, 'r') as f:
+            tags_data = json.load(f)
+        last_update = datetime.strptime(tags_data['last_update'], "%Y-%m-%d")
+        if datetime.now() - last_update < timedelta(days=TAGS_UPDATE_INTERVAL):
+            return tags_data['tags']
+
+    # If the JSON file doesn't exist or is older than the update interval, load from YAML
+    with open("sources/landscape_augmented.yml", 'r') as f:
+        data = yaml.safe_load(f)
+
+    tags = []
+    # Initialize a dictionary to track the current category and project name
+    tags_dict = {'Project_name': ""}
+    # Process the loaded data
+    for category in data['landscape']:
+        category_list = ["App Definition and Development", "Orchestration & Management", "Runtime", \
+            "Provisioning", "Observability and Analysis", "Test_Provisioning"]
+        if category['name'] not in category_list:
+            continue
+        tags_dict['Category'] = category['name']
+        for subcategory in category.get('subcategories', []):
+            for item in subcategory.get('items', []):
+                project_name = item['name'].split('(')[0].strip()
+                tags_dict['Project_name'] = project_name
+                tags.append(tags_dict['Project_name'])
+
+    # Save the tags to the JSON file with the current date
+    tags_data = {
+        'tags': tags,
+        'last_update': datetime.now().strftime("%Y-%m-%d")
+    }
+    with open(TAGS_FILE, 'w') as f:
+        json.dump(tags_data, f)
+
+    return tags
+
+if __name__ == "__main__":
+    tags = load_tags()
+    request_count = 0
+    # Extract and save QA pairs incrementally
+    extract_all_projects(tags, request_count)